Remove the legacy KPL (Keras preprocessing layers) directory as well as the legacy Keras preprocessing package.
Also update the related build targets and tests so they no longer depend on them.
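For users of the removed internal import paths (tensorflow.python.keras.layers.preprocessing.*), the equivalent layers are exposed through the public Keras API. A minimal usage sketch, assuming TF 2.6+ where these layers are exported under tf.keras.layers:

import numpy as np
import tensorflow as tf

# Feature normalization: adapt() learns mean/variance from sample data.
norm = tf.keras.layers.Normalization(axis=-1)
norm.adapt(np.array([[1.0], [2.0], [3.0]], dtype="float32"))
print(norm(np.array([[2.0]], dtype="float32")))  # ~0.0 after standardization

# Text vectorization: adapt() builds a vocabulary from a small corpus.
vectorizer = tf.keras.layers.TextVectorization(output_mode="int")
vectorizer.adapt(["the quick brown fox", "the lazy dog"])
print(vectorizer(["the fox"]))  # integer token ids, padded to the batch's longest sequence
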
PiperOrigin-RevId: 406052251
Change-Id: I8e4a0a7747434db997d3e34f82d22cf4423dcd5e
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 3959e3d..7dea85f 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -40,7 +40,6 @@
"//tensorflow/python/keras/mixed_precision:mixed_precision_experimental",
"//tensorflow/python/keras/optimizer_v2",
"//tensorflow/python/keras/premade",
- "//tensorflow/python/keras/preprocessing",
"//tensorflow/python/keras/saving",
"//tensorflow/python/keras/utils",
"//tensorflow/python/keras/wrappers",
diff --git a/tensorflow/python/keras/layers/BUILD b/tensorflow/python/keras/layers/BUILD
index b6e9279..f80301e 100644
--- a/tensorflow/python/keras/layers/BUILD
+++ b/tensorflow/python/keras/layers/BUILD
@@ -51,9 +51,9 @@
":recurrent_v2",
":rnn_cell_wrapper_v2",
":wrappers",
+ "//tensorflow/python/keras/engine",
"//tensorflow/python/keras/feature_column",
"//tensorflow/python/keras/layers/normalization",
- "//tensorflow/python/keras/layers/preprocessing",
"//tensorflow/python/keras/premade",
"//tensorflow/python/keras/utils:tf_utils",
],
@@ -881,15 +881,3 @@
"@absl_py//absl/testing:parameterized",
],
)
-
-tf_py_test(
- name = "layers_test",
- size = "small",
- srcs = ["layers_test.py"],
- python_version = "PY3",
- deps = [
- ":layers",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:tf2",
- ],
-)
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 4223f7d..7671c02 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -25,29 +25,6 @@
from tensorflow.python.keras.engine.base_layer import Layer
from tensorflow.python.keras.engine.base_preprocessing_layer import PreprocessingLayer
-# Image preprocessing layers.
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import CenterCrop
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import RandomCrop
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import RandomFlip
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import RandomContrast
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import RandomHeight
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import RandomRotation
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import RandomTranslation
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import RandomWidth
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import RandomZoom
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import Resizing
-from tensorflow.python.keras.layers.preprocessing.image_preprocessing import Rescaling
-
-# Preprocessing layers.
-from tensorflow.python.keras.layers.preprocessing.category_crossing import CategoryCrossing
-from tensorflow.python.keras.layers.preprocessing.category_encoding import CategoryEncoding
-from tensorflow.python.keras.layers.preprocessing.discretization import Discretization
-from tensorflow.python.keras.layers.preprocessing.hashing import Hashing
-from tensorflow.python.keras.layers.preprocessing.integer_lookup import IntegerLookup
-from tensorflow.python.keras.layers.preprocessing.normalization import Normalization
-from tensorflow.python.keras.layers.preprocessing.string_lookup import StringLookup
-from tensorflow.python.keras.layers.preprocessing.text_vectorization import TextVectorization
-
# Advanced activations.
from tensorflow.python.keras.layers.advanced_activations import LeakyReLU
from tensorflow.python.keras.layers.advanced_activations import PReLU
diff --git a/tensorflow/python/keras/layers/layers_test.py b/tensorflow/python/keras/layers/layers_test.py
deleted file mode 100644
index 60349b2..0000000
--- a/tensorflow/python/keras/layers/layers_test.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# pylint: disable=g-classes-have-attributes
-"""Tests for layers.__init__."""
-
-from tensorflow.python import tf2
-from tensorflow.python.keras import layers
-from tensorflow.python.platform import test
-
-
-class LayersTest(test.TestCase):
-
- def test_keras_private_symbol(self):
- if tf2.enabled():
- normalization_parent = layers.Normalization.__module__.split('.')[-1]
- self.assertEqual('normalization', normalization_parent)
- self.assertTrue(layers.BatchNormalization._USE_V2_BEHAVIOR)
- else:
- self.assertFalse(layers.BatchNormalization._USE_V2_BEHAVIOR)
-
-
-if __name__ == '__main__':
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD
deleted file mode 100644
index 4459ca3..0000000
--- a/tensorflow/python/keras/layers/preprocessing/BUILD
+++ /dev/null
@@ -1,757 +0,0 @@
-# Description:
-# Contains the Keras preprocess layers (internal TensorFlow version).
-
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-
-# buildifier: disable=same-origin-load
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("//tensorflow/core/platform/default:distribute.bzl", "distribute_py_test")
-
-package(
- default_visibility = [
- "//tensorflow/python/keras:__subpackages__",
- "//tensorflow/tools/pip_package:__pkg__",
- ],
- licenses = ["notice"],
-)
-
-filegroup(
- name = "all_py_srcs",
- srcs = glob(["*.py"]),
- visibility = ["//tensorflow/python/keras/google/private_tf_api_test:__pkg__"],
-)
-
-py_library(
- name = "preprocessing",
- srcs = [
- "__init__.py",
- ],
- srcs_version = "PY3",
- deps = [
- ":category_crossing",
- ":discretization",
- ":hashing",
- ":image_preprocessing",
- ":integer_lookup",
- ":normalization",
- ":preprocessing_stage",
- ":preprocessing_test_utils",
- ":reduction",
- ":string_lookup",
- ":text_vectorization",
- ],
-)
-
-py_library(
- name = "discretization",
- srcs = [
- "discretization.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:boosted_trees_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:resources",
- "//tensorflow/python:sparse_tensor",
- "//tensorflow/python:tensor_spec",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/keras/utils:tf_utils",
- "//tensorflow/python/ops/parallel_for:control_flow_ops",
- "//tensorflow/python/ops/ragged:ragged_functional_ops",
- "//tensorflow/python/util:tf_export",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "category_crossing",
- srcs = [
- "category_crossing.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:sparse_ops",
- "//tensorflow/python:sparse_tensor",
- "//tensorflow/python:tensor_shape",
- "//tensorflow/python:tensor_spec",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/keras/utils:tf_utils",
- "//tensorflow/python/ops/ragged:ragged_array_ops",
- "//tensorflow/python/ops/ragged:ragged_tensor",
- "//tensorflow/python/util:tf_export",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "hashing",
- srcs = [
- "hashing.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:sparse_ops",
- "//tensorflow/python:sparse_tensor",
- "//tensorflow/python:string_ops",
- "//tensorflow/python:tensor_shape",
- "//tensorflow/python:tensor_spec",
- "//tensorflow/python:tensor_util",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/ops/ragged:ragged_tensor",
- "//tensorflow/python/util:tf_export",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "image_preprocessing",
- srcs = [
- "image_preprocessing.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:check_ops",
- "//tensorflow/python:control_flow_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:image_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:stateful_random_ops",
- "//tensorflow/python:stateless_random_ops",
- "//tensorflow/python:tensor_shape",
- "//tensorflow/python:tensor_util",
- "//tensorflow/python:variables",
- "//tensorflow/python/compat",
- "//tensorflow/python/eager:context",
- "//tensorflow/python/keras:backend",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/keras/engine:input_spec",
- "//tensorflow/python/keras/preprocessing:image",
- "//tensorflow/python/keras/utils:control_flow_util",
- "//tensorflow/python/util:tf_export",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "index_lookup",
- srcs = [
- "index_lookup.py",
- ],
- srcs_version = "PY3",
- deps = [
- ":category_encoding",
- ":table_utils",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:lookup_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:tensor_shape",
- "//tensorflow/python:tensor_spec",
- "//tensorflow/python:util",
- "//tensorflow/python/keras:backend",
- "//tensorflow/python/keras/engine",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "normalization",
- srcs = [
- "normalization.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:init_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:tensor_shape",
- "//tensorflow/python:util",
- "//tensorflow/python/keras:backend",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/util:tf_export",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "integer_lookup",
- srcs = [
- "integer_lookup.py",
- ],
- srcs_version = "PY3",
- deps = [
- ":index_lookup",
- ":table_utils",
- "//tensorflow/python:dtypes",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/util:tf_export",
- ],
-)
-
-py_library(
- name = "table_utils",
- srcs = [
- "table_utils.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:platform",
- "//tensorflow/python:sparse_tensor",
- "//tensorflow/python:string_ops",
- "//tensorflow/python/keras/utils:tf_utils",
- "//tensorflow/python/ops/ragged:ragged_functional_ops",
- "//tensorflow/python/ops/ragged:ragged_tensor",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "text_vectorization",
- srcs = [
- "text_vectorization.py",
- ],
- srcs_version = "PY3",
- deps = [
- ":category_encoding",
- ":string_lookup",
- "//tensorflow/python:array_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:string_ops",
- "//tensorflow/python:tensor_shape",
- "//tensorflow/python:tensor_spec",
- "//tensorflow/python/data/ops:dataset_ops",
- "//tensorflow/python/keras:backend",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/keras/utils:layer_utils",
- "//tensorflow/python/keras/utils:tf_utils",
- "//tensorflow/python/ops/ragged:ragged_functional_ops",
- "//tensorflow/python/ops/ragged:ragged_string_ops",
- "//tensorflow/python/util:tf_export",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "category_encoding",
- srcs = [
- "category_encoding.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:bincount_ops",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:init_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:sparse_ops",
- "//tensorflow/python:sparse_tensor",
- "//tensorflow/python:tensor_shape",
- "//tensorflow/python:tensor_spec",
- "//tensorflow/python:util",
- "//tensorflow/python/keras:backend",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/keras/engine:input_spec",
- "//tensorflow/python/keras/utils:layer_utils",
- "//tensorflow/python/ops/ragged:ragged_tensor",
- "//tensorflow/python/util:tf_export",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "reduction",
- srcs = [
- "reduction.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:platform",
- "//tensorflow/python/keras/engine:base_layer",
- ],
-)
-
-py_library(
- name = "string_lookup",
- srcs = [
- "string_lookup.py",
- ],
- srcs_version = "PY3",
- deps = [
- ":index_lookup",
- ":table_utils",
- "//tensorflow/python:dtypes",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/util:tf_export",
- ],
-)
-
-py_library(
- name = "preprocessing_stage",
- srcs = [
- "preprocessing_stage.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:framework_ops",
- "//tensorflow/python:util",
- "//tensorflow/python/data/ops:dataset_ops",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/keras/utils:tf_utils",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "preprocessing_test_utils",
- srcs = ["preprocessing_test_utils.py"],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:util",
- "//third_party/py/numpy",
- ],
-)
-
-cuda_py_test(
- name = "category_crossing_test",
- srcs = ["category_crossing_test.py"],
- python_version = "PY3",
- shard_count = 4,
- tags = [
- "no_windows", # b/149031156
- ],
- deps = [
- ":category_crossing",
- "//tensorflow/python:array_ops",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:constant_op",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:sparse_ops",
- "//tensorflow/python:sparse_tensor",
- "//tensorflow/python:tensor_shape",
- "//tensorflow/python:tensor_spec",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras:testing_utils",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/ops/ragged:ragged_factory_ops",
- "//tensorflow/python/ops/ragged:ragged_tensor",
- "//third_party/py/numpy",
- ],
-)
-
-tf_py_test(
- name = "category_encoding_test",
- srcs = ["category_encoding_test.py"],
- python_version = "PY3",
- deps = [
- ":category_encoding",
- ":preprocessing_test_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/utils:generic_utils",
- "//tensorflow/python/ops/ragged:ragged_string_ops",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-distribute_py_test(
- name = "category_encoding_distribution_test",
- srcs = ["category_encoding_distribution_test.py"],
- disable_mlir_bridge = False,
- main = "category_encoding_distribution_test.py",
- python_version = "PY3",
- tags = [
- "multi_and_single_gpu",
- "no_oss", # b/189866692
- "no_rocm",
- "noguitar", # b/190034522
- ],
- tpu_tags = [
- "no_oss", # b/155502591
- ],
- deps = [
- ":category_encoding",
- ":preprocessing_test_utils",
- "//tensorflow/python:config",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_test_combinations_lib",
- "//tensorflow/python/compat:v2_compat",
- "//tensorflow/python/data/ops:dataset_ops",
- "//tensorflow/python/distribute:combinations",
- "//tensorflow/python/distribute:multi_process_runner",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras:backend",
- "//tensorflow/python/keras/distribute:strategy_combinations",
- ],
-)
-
-distribute_py_test(
- name = "category_crossing_distribution_test",
- srcs = ["category_crossing_distribution_test.py"],
- main = "category_crossing_distribution_test.py",
- python_version = "PY3",
- tags = [
- "multi_and_single_gpu",
- ],
- tpu_tags = [
- "no_oss", # b/155502591
- ],
- deps = [
- ":category_crossing",
- ":preprocessing_test_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:config",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_test_combinations_lib",
- "//tensorflow/python/data/ops:dataset_ops",
- "//tensorflow/python/distribute:combinations",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras:backend",
- "//tensorflow/python/keras/distribute:strategy_combinations",
- ],
-)
-
-distribute_py_test(
- name = "image_preprocessing_distribution_test",
- srcs = ["image_preprocessing_distribution_test.py"],
- main = "image_preprocessing_distribution_test.py",
- python_version = "PY3",
- shard_count = 4,
- tags = [
- "multi_and_single_gpu",
- "no_rocm",
- ],
- tpu_tags = [
- "no_oss",
- "noguitar", # TODO(b/183957207)
- ],
- deps = [
- ":image_preprocessing",
- ":preprocessing_test_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_test_combinations_lib",
- "//tensorflow/python/data/ops:dataset_ops",
- "//tensorflow/python/distribute:combinations",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/distribute:strategy_combinations",
- ],
-)
-
-tf_py_test(
- name = "discretization_test",
- srcs = ["discretization_test.py"],
- python_version = "PY3",
- shard_count = 4,
- tags = ["no_rocm"],
- deps = [
- ":discretization",
- ":preprocessing_test_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-distribute_py_test(
- name = "discretization_distribution_test",
- srcs = ["discretization_distribution_test.py"],
- main = "discretization_distribution_test.py",
- python_version = "PY3",
- tags = [
- "multi_and_single_gpu",
- "no_oss", # TODO(b/189956080)
- "no_rocm",
- "noguitar", # b/190034522
- ],
- deps = [
- ":discretization",
- ":preprocessing_test_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:config",
- "//tensorflow/python:framework_test_combinations_lib",
- "//tensorflow/python/distribute:combinations",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/distribute:strategy_combinations",
- ],
-)
-
-cuda_py_test(
- name = "hashing_test",
- srcs = ["hashing_test.py"],
- python_version = "PY3",
- shard_count = 4,
- deps = [
- ":hashing",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:constant_op",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:sparse_tensor",
- "//tensorflow/python:tensor_shape",
- "//tensorflow/python:tensor_spec",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras:testing_utils",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/ops/ragged:ragged_factory_ops",
- "//third_party/py/numpy",
- ],
-)
-
-distribute_py_test(
- name = "hashing_distribution_test",
- srcs = ["hashing_distribution_test.py"],
- disable_mlir_bridge = False,
- main = "hashing_distribution_test.py",
- python_version = "PY3",
- tags = [
- "multi_and_single_gpu",
- ],
- deps = [
- ":hashing",
- "//tensorflow/python/distribute:combinations",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/distribute:strategy_combinations",
- ],
-)
-
-tf_py_test(
- name = "index_lookup_test",
- srcs = ["index_lookup_test.py"],
- python_version = "PY3",
- tags = ["noasan"], # TODO(b/183961255)
- deps = [
- ":index_lookup",
- ":preprocessing_test_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/utils:generic_utils",
- "//tensorflow/python/ops/ragged:ragged_string_ops",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-distribute_py_test(
- name = "index_lookup_distribution_test",
- srcs = ["index_lookup_distribution_test.py"],
- disable_mlir_bridge = False,
- main = "index_lookup_distribution_test.py",
- python_version = "PY3",
- tags = [
- "multi_and_single_gpu",
- "no_rocm",
- ],
- tpu_tags = ["no_oss"],
- deps = [
- ":index_lookup",
- "//tensorflow/python/distribute:combinations",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/distribute:strategy_combinations",
- ],
-)
-
-cuda_py_test(
- name = "image_preprocessing_test",
- srcs = ["image_preprocessing_test.py"],
- python_version = "PY3",
- shard_count = 4,
- tags = [
- "no_windows", # TODO(b/184424727): Re-enable this.
- ],
- deps = [
- ":image_preprocessing",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:errors",
- "//tensorflow/python:image_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:random_ops",
- "//tensorflow/python:stateful_random_ops",
- "//tensorflow/python:stateless_random_ops",
- "//tensorflow/python/compat",
- "//tensorflow/python/distribute:mirrored_strategy",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras:testing_utils",
- "//tensorflow/python/keras/engine",
- "//tensorflow/python/keras/utils:generic_utils",
- "//third_party/py/numpy",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-tf_py_test(
- name = "normalization_test",
- srcs = ["normalization_test.py"],
- python_version = "PY3",
- shard_count = 4,
- tags = [
- "broken", # b/170974360
- "noasan", # TODO(b/337374867) fails with -fsanitize=null
- ],
- deps = [
- ":normalization",
- ":preprocessing_test_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-tf_py_test(
- name = "integer_lookup_test",
- srcs = ["integer_lookup_test.py"],
- python_version = "PY3",
- tags = ["noasan"], # TODO(b/183961255)
- deps = [
- ":integer_lookup",
- ":preprocessing_test_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/utils:generic_utils",
- "//tensorflow/python/ops/ragged:ragged_string_ops",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-distribute_py_test(
- name = "normalization_distribution_test",
- srcs = ["normalization_distribution_test.py"],
- main = "normalization_distribution_test.py",
- python_version = "PY3",
- tags = [
- "no_cuda_asan",
- "no_oss",
- ],
- deps = [
- ":normalization",
- ":preprocessing_test_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:framework_test_combinations_lib",
- "//tensorflow/python/data/ops:dataset_ops",
- "//tensorflow/python/distribute:combinations",
- "//tensorflow/python/eager:context",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/distribute:strategy_combinations",
- ],
-)
-
-tf_py_test(
- name = "table_utils_test",
- srcs = ["table_utils_test.py"],
- python_version = "PY3",
- deps = [
- ":table_utils",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/utils:generic_utils",
- "//tensorflow/python/ops/ragged:ragged_string_ops",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-tf_py_test(
- name = "text_vectorization_test",
- srcs = ["text_vectorization_test.py"],
- python_version = "PY3",
- shard_count = 4,
- deps = [
- ":preprocessing_test_utils",
- ":text_vectorization",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/utils:generic_utils",
- "//tensorflow/python/ops/ragged:ragged_string_ops",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-distribute_py_test(
- name = "text_vectorization_distribution_test",
- srcs = ["text_vectorization_distribution_test.py"],
- main = "text_vectorization_distribution_test.py",
- python_version = "PY3",
- tags = [
- "multi_and_single_gpu",
- "no_rocm",
- ],
- tpu_tags = [
- "no_oss", # b/155502591
- ],
- deps = [
- ":preprocessing_test_utils",
- ":text_vectorization",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:config",
- "//tensorflow/python:dtypes",
- "//tensorflow/python:framework_test_combinations_lib",
- "//tensorflow/python/data/ops:dataset_ops",
- "//tensorflow/python/distribute:combinations",
- "//tensorflow/python/eager:context",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/distribute:strategy_combinations",
- ],
-)
-
-tf_py_test(
- name = "reduction_test",
- srcs = ["reduction_test.py"],
- python_version = "PY3",
- shard_count = 4,
- tags = ["notsan"], # TODO(b/170783154)
- deps = [
- ":reduction",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-tf_py_test(
- name = "string_lookup_test",
- srcs = ["string_lookup_test.py"],
- python_version = "PY3",
- tags = [
- "notsan", #b/168758821
- ],
- deps = [
- ":preprocessing_test_utils",
- ":string_lookup",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/utils:generic_utils",
- "//tensorflow/python/ops/ragged:ragged_string_ops",
- "@absl_py//absl/testing:parameterized",
- ],
-)
-
-tf_py_test(
- name = "preprocessing_stage_test",
- srcs = ["preprocessing_stage_test.py"],
- python_version = "PY3",
- tags = ["no_windows"], # TODO(b/152991402)
- deps = [
- ":preprocessing_stage",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/keras",
- "//third_party/py/numpy",
- "@absl_py//absl/testing:parameterized",
- ],
-)
diff --git a/tensorflow/python/keras/layers/preprocessing/__init__.py b/tensorflow/python/keras/layers/preprocessing/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/tensorflow/python/keras/layers/preprocessing/__init__.py
+++ /dev/null
diff --git a/tensorflow/python/keras/layers/preprocessing/category_crossing.py b/tensorflow/python/keras/layers/preprocessing/category_crossing.py
deleted file mode 100644
index 4e0b63e..0000000
--- a/tensorflow/python/keras/layers/preprocessing/category_crossing.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras category crossing preprocessing layers."""
-# pylint: disable=g-classes-have-attributes
-
-import itertools
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.keras.engine import base_layer
-from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.util.tf_export import keras_export
-
-
-@keras_export('keras.layers.experimental.preprocessing.CategoryCrossing')
-class CategoryCrossing(base_layer.Layer):
- """Category crossing layer.
-
- This layer concatenates multiple categorical inputs into a single categorical
- output (similar to Cartesian product). The output dtype is string.
-
- Usage:
- >>> inp_1 = ['a', 'b', 'c']
- >>> inp_2 = ['d', 'e', 'f']
- >>> layer = tf.keras.layers.experimental.preprocessing.CategoryCrossing()
- >>> layer([inp_1, inp_2])
- <tf.Tensor: shape=(3, 1), dtype=string, numpy=
- array([[b'a_X_d'],
- [b'b_X_e'],
- [b'c_X_f']], dtype=object)>
-
-
- >>> inp_1 = ['a', 'b', 'c']
- >>> inp_2 = ['d', 'e', 'f']
- >>> layer = tf.keras.layers.experimental.preprocessing.CategoryCrossing(
- ... separator='-')
- >>> layer([inp_1, inp_2])
- <tf.Tensor: shape=(3, 1), dtype=string, numpy=
- array([[b'a-d'],
- [b'b-e'],
- [b'c-f']], dtype=object)>
-
- Args:
- depth: depth of input crossing. By default None, all inputs are crossed into
- one output. It can also be an int or tuple/list of ints. Passing an
- integer will create combinations of crossed outputs with depth up to that
- integer, i.e., [1, 2, ..., `depth`), and passing a tuple of integers will
- create crossed outputs with depth for the specified values in the tuple,
- i.e., `depth`=(N1, N2) will create all possible crossed outputs with depth
- equal to N1 or N2. Passing `None` means a single crossed output with all
- inputs. For example, with inputs `a`, `b` and `c`, `depth=2` means the
- output will be [a;b;c;cross(a, b);cross(bc);cross(ca)].
- separator: A string added between each input being joined. Defaults to
- '_X_'.
- name: Name to give to the layer.
- **kwargs: Keyword arguments to construct a layer.
-
- Input shape: a list of string or int tensors or sparse tensors of shape
- `[batch_size, d1, ..., dm]`
-
- Output shape: a single string or int tensor or sparse tensor of shape
- `[batch_size, d1, ..., dm]`
-
- Returns:
- If any input is `RaggedTensor`, the output is `RaggedTensor`.
- Else, if any input is `SparseTensor`, the output is `SparseTensor`.
- Otherwise, the output is `Tensor`.
-
- Example: (`depth`=None)
- If the layer receives three inputs:
- `a=[[1], [4]]`, `b=[[2], [5]]`, `c=[[3], [6]]`
- the output will be a string tensor:
- `[[b'1_X_2_X_3'], [b'4_X_5_X_6']]`
-
- Example: (`depth` is an integer)
- With the same input above, and if `depth`=2,
- the output will be a list of 6 string tensors:
- `[[b'1'], [b'4']]`
- `[[b'2'], [b'5']]`
- `[[b'3'], [b'6']]`
- `[[b'1_X_2'], [b'4_X_5']]`,
- `[[b'2_X_3'], [b'5_X_6']]`,
- `[[b'3_X_1'], [b'6_X_4']]`
-
- Example: (`depth` is a tuple/list of integers)
- With the same input above, and if `depth`=(2, 3)
- the output will be a list of 4 string tensors:
- `[[b'1_X_2'], [b'4_X_5']]`,
- `[[b'2_X_3'], [b'5_X_6']]`,
- `[[b'3_X_1'], [b'6_X_4']]`,
- `[[b'1_X_2_X_3'], [b'4_X_5_X_6']]`
- """
-
- def __init__(self, depth=None, name=None, separator='_X_', **kwargs):
- super(CategoryCrossing, self).__init__(name=name, **kwargs)
- self.depth = depth
- self.separator = separator
- if isinstance(depth, (tuple, list)):
- self._depth_tuple = depth
- elif depth is not None:
- self._depth_tuple = tuple([i for i in range(1, depth + 1)])
-
- def partial_crossing(self, partial_inputs, ragged_out, sparse_out):
- """Gets the crossed output from a partial list/tuple of inputs."""
- # If ragged_out=True, convert output from sparse to ragged.
- if ragged_out:
- # TODO(momernick): Support separator with ragged_cross.
- if self.separator != '_X_':
- raise ValueError('Non-default separator with ragged input is not '
- 'supported yet, given {}'.format(self.separator))
- return ragged_array_ops.cross(partial_inputs)
- elif sparse_out:
- return sparse_ops.sparse_cross(partial_inputs, separator=self.separator)
- else:
- return sparse_ops.sparse_tensor_to_dense(
- sparse_ops.sparse_cross(partial_inputs, separator=self.separator))
-
- def _preprocess_input(self, inp):
- if isinstance(inp, (list, tuple, np.ndarray)):
- inp = ops.convert_to_tensor_v2_with_dispatch(inp)
- if inp.shape.rank == 1:
- inp = array_ops.expand_dims(inp, axis=-1)
- return inp
-
- def call(self, inputs):
- inputs = [self._preprocess_input(inp) for inp in inputs]
- depth_tuple = self._depth_tuple if self.depth else (len(inputs),)
- ragged_out = sparse_out = False
- if any(tf_utils.is_ragged(inp) for inp in inputs):
- ragged_out = True
- elif any(isinstance(inp, sparse_tensor.SparseTensor) for inp in inputs):
- sparse_out = True
-
- outputs = []
- for depth in depth_tuple:
- if len(inputs) < depth:
- raise ValueError(
- 'Number of inputs cannot be less than depth, got {} input tensors, '
- 'and depth {}'.format(len(inputs), depth))
- for partial_inps in itertools.combinations(inputs, depth):
- partial_out = self.partial_crossing(
- partial_inps, ragged_out, sparse_out)
- outputs.append(partial_out)
- if sparse_out:
- return sparse_ops.sparse_concat_v2(axis=1, sp_inputs=outputs)
- return array_ops.concat(outputs, axis=1)
-
- def compute_output_shape(self, input_shape):
- if not isinstance(input_shape, (tuple, list)):
- raise ValueError('A `CategoryCrossing` layer should be called '
- 'on a list of inputs.')
- input_shapes = input_shape
- batch_size = None
- for inp_shape in input_shapes:
- inp_tensor_shape = tensor_shape.TensorShape(inp_shape).as_list()
- if len(inp_tensor_shape) != 2:
- raise ValueError('Inputs must be rank 2, get {}'.format(input_shapes))
- if batch_size is None:
- batch_size = inp_tensor_shape[0]
- # The second dimension is dynamic based on inputs.
- output_shape = [batch_size, None]
- return tensor_shape.TensorShape(output_shape)
-
- def compute_output_signature(self, input_spec):
- input_shapes = [x.shape for x in input_spec]
- output_shape = self.compute_output_shape(input_shapes)
- if any(
- isinstance(inp_spec, ragged_tensor.RaggedTensorSpec)
- for inp_spec in input_spec):
- return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.string)
- elif any(
- isinstance(inp_spec, sparse_tensor.SparseTensorSpec)
- for inp_spec in input_spec):
- return sparse_tensor.SparseTensorSpec(
- shape=output_shape, dtype=dtypes.string)
- return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.string)
-
- def get_config(self):
- config = {
- 'depth': self.depth,
- 'separator': self.separator,
- }
- base_config = super(CategoryCrossing, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/layers/preprocessing/category_crossing_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/category_crossing_distribution_test.py
deleted file mode 100644
index 4807f23..0000000
--- a/tensorflow/python/keras/layers/preprocessing/category_crossing_distribution_test.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Distribution tests for keras.layers.preprocessing.category_crossing."""
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import combinations as ds_combinations
-from tensorflow.python.framework import config
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_combinations as combinations
-from tensorflow.python.keras import backend
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.distribute.strategy_combinations import all_strategies
-from tensorflow.python.keras.layers.preprocessing import category_crossing
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.platform import test
-
-
-def batch_wrapper(dataset, batch_size, distribution, repeat=None):
- if repeat:
- dataset = dataset.repeat(repeat)
- # TPUs currently require fully defined input shapes, drop_remainder ensures
- # the input will have fully defined shapes.
- if backend.is_tpu_strategy(distribution):
- return dataset.batch(batch_size, drop_remainder=True)
- else:
- return dataset.batch(batch_size)
-
-
-@ds_combinations.generate(
- combinations.combine(
- # Investigate why crossing is not supported with TPU.
- distribution=all_strategies,
- mode=['eager', 'graph']))
-class CategoryCrossingDistributionTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_distribution(self, distribution):
- input_array_1 = np.array([['a', 'b'], ['c', 'd']])
- input_array_2 = np.array([['e', 'f'], ['g', 'h']])
- inp_dataset = dataset_ops.DatasetV2.from_tensor_slices(
- {'input_1': input_array_1, 'input_2': input_array_2})
- inp_dataset = batch_wrapper(inp_dataset, 2, distribution)
-
- # pyformat: disable
- expected_output = [[b'a_X_e', b'a_X_f', b'b_X_e', b'b_X_f'],
- [b'c_X_g', b'c_X_h', b'd_X_g', b'd_X_h']]
- config.set_soft_device_placement(True)
-
- with distribution.scope():
- input_data_1 = keras.Input(shape=(2,), dtype=dtypes.string,
- name='input_1')
- input_data_2 = keras.Input(shape=(2,), dtype=dtypes.string,
- name='input_2')
- input_data = [input_data_1, input_data_2]
- layer = category_crossing.CategoryCrossing()
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(inp_dataset)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-if __name__ == '__main__':
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/category_crossing_test.py b/tensorflow/python/keras/layers/preprocessing/category_crossing_test.py
deleted file mode 100644
index e65bccb..0000000
--- a/tensorflow/python/keras/layers/preprocessing/category_crossing_test.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for categorical preprocessing layers."""
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.engine import input_layer
-from tensorflow.python.keras.engine import training
-from tensorflow.python.keras.layers.preprocessing import category_crossing
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.platform import test
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoryCrossingTest(keras_parameterized.TestCase):
-
- def test_crossing_sparse_inputs(self):
- layer = category_crossing.CategoryCrossing()
- inputs_0 = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 0], [1, 1]],
- values=['a', 'b', 'c'],
- dense_shape=[2, 2])
- inputs_1 = sparse_tensor.SparseTensor(
- indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3])
- output = layer([inputs_0, inputs_1])
- self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices)
- self.assertAllEqual([b'a_X_d', b'b_X_e', b'c_X_e'], output.values)
-
- def test_crossing_sparse_inputs_custom_sep(self):
- layer = category_crossing.CategoryCrossing(separator='_Y_')
- inputs_0 = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 0], [1, 1]],
- values=['a', 'b', 'c'],
- dense_shape=[2, 2])
- inputs_1 = sparse_tensor.SparseTensor(
- indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3])
- output = layer([inputs_0, inputs_1])
- self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices)
- self.assertAllEqual([b'a_Y_d', b'b_Y_e', b'c_Y_e'], output.values)
-
- def test_crossing_sparse_inputs_empty_sep(self):
- layer = category_crossing.CategoryCrossing(separator='')
- inputs_0 = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 0], [1, 1]],
- values=['a', 'b', 'c'],
- dense_shape=[2, 2])
- inputs_1 = sparse_tensor.SparseTensor(
- indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3])
- output = layer([inputs_0, inputs_1])
- self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices)
- self.assertAllEqual([b'ad', b'be', b'ce'], output.values)
-
- def test_crossing_sparse_inputs_depth_int(self):
- layer = category_crossing.CategoryCrossing(depth=1)
- inputs_0 = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 0], [2, 0]],
- values=['a', 'b', 'c'],
- dense_shape=[3, 1])
- inputs_1 = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 0], [2, 0]],
- values=['d', 'e', 'f'],
- dense_shape=[3, 1])
- output = layer([inputs_0, inputs_1])
- self.assertIsInstance(output, sparse_tensor.SparseTensor)
- output = sparse_ops.sparse_tensor_to_dense(output)
- expected_out = [[b'a', b'd'], [b'b', b'e'], [b'c', b'f']]
- self.assertAllEqual(expected_out, output)
-
- def test_crossing_sparse_inputs_depth_tuple(self):
- layer = category_crossing.CategoryCrossing(depth=(2, 3))
- inputs_0 = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 0], [2, 0]],
- values=['a', 'b', 'c'],
- dense_shape=[3, 1])
- inputs_1 = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 0], [2, 0]],
- values=['d', 'e', 'f'],
- dense_shape=[3, 1])
- inputs_2 = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 0], [2, 0]],
- values=['g', 'h', 'i'],
- dense_shape=[3, 1])
- inp_0_t = input_layer.Input(shape=(1,), sparse=True, dtype=dtypes.string)
- inp_1_t = input_layer.Input(shape=(1,), sparse=True, dtype=dtypes.string)
- inp_2_t = input_layer.Input(shape=(1,), sparse=True, dtype=dtypes.string)
- out_t = layer([inp_0_t, inp_1_t, inp_2_t])
- model = training.Model([inp_0_t, inp_1_t, inp_2_t], out_t)
- output = model.predict([inputs_0, inputs_1, inputs_2])
- self.assertIsInstance(output, sparse_tensor.SparseTensor)
- output = sparse_ops.sparse_tensor_to_dense(output)
- expected_outputs_0 = [[b'a_X_d', b'a_X_g', b'd_X_g', b'a_X_d_X_g']]
- expected_outputs_1 = [[b'b_X_e', b'b_X_h', b'e_X_h', b'b_X_e_X_h']]
- expected_outputs_2 = [[b'c_X_f', b'c_X_i', b'f_X_i', b'c_X_f_X_i']]
- expected_out = array_ops.concat(
- [expected_outputs_0, expected_outputs_1, expected_outputs_2], axis=0)
- self.assertAllEqual(expected_out, output)
-
- def test_crossing_ragged_inputs(self):
- inputs_0 = ragged_factory_ops.constant(
- [['omar', 'skywalker'], ['marlo']],
- dtype=dtypes.string)
- inputs_1 = ragged_factory_ops.constant(
- [['a'], ['b']],
- dtype=dtypes.string)
- inp_0_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- inp_1_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
-
- non_hashed_layer = category_crossing.CategoryCrossing()
- out_t = non_hashed_layer([inp_0_t, inp_1_t])
- model = training.Model(inputs=[inp_0_t, inp_1_t], outputs=out_t)
- expected_output = [[b'omar_X_a', b'skywalker_X_a'], [b'marlo_X_b']]
- self.assertAllEqual(expected_output, model.predict([inputs_0, inputs_1]))
-
- def test_crossing_ragged_inputs_depth_int(self):
- layer = category_crossing.CategoryCrossing(depth=1)
- inputs_0 = ragged_factory_ops.constant([['a'], ['b'], ['c']])
- inputs_1 = ragged_factory_ops.constant([['d'], ['e'], ['f']])
- output = layer([inputs_0, inputs_1])
- expected_output = [[b'a', b'd'], [b'b', b'e'], [b'c', b'f']]
- self.assertIsInstance(output, ragged_tensor.RaggedTensor)
- self.assertAllEqual(expected_output, output)
-
- layer = category_crossing.CategoryCrossing(depth=2)
- inp_0_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- inp_1_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- out_t = layer([inp_0_t, inp_1_t])
- model = training.Model([inp_0_t, inp_1_t], out_t)
- expected_output = [[b'a', b'd', b'a_X_d'], [b'b', b'e', b'b_X_e'],
- [b'c', b'f', b'c_X_f']]
- self.assertAllEqual(expected_output, model.predict([inputs_0, inputs_1]))
-
- def test_crossing_ragged_inputs_depth_tuple(self):
- layer = category_crossing.CategoryCrossing(depth=[2, 3])
- inputs_0 = ragged_factory_ops.constant([['a'], ['b'], ['c']])
- inputs_1 = ragged_factory_ops.constant([['d'], ['e'], ['f']])
- inputs_2 = ragged_factory_ops.constant([['g'], ['h'], ['i']])
- inp_0_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- inp_1_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- inp_2_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- out_t = layer([inp_0_t, inp_1_t, inp_2_t])
- model = training.Model([inp_0_t, inp_1_t, inp_2_t], out_t)
- expected_output = [[b'a_X_d', b'a_X_g', b'd_X_g', b'a_X_d_X_g'],
- [b'b_X_e', b'b_X_h', b'e_X_h', b'b_X_e_X_h'],
- [b'c_X_f', b'c_X_i', b'f_X_i', b'c_X_f_X_i']]
- output = model.predict([inputs_0, inputs_1, inputs_2])
- self.assertIsInstance(output, ragged_tensor.RaggedTensor)
- self.assertAllEqual(expected_output, output)
-
- def test_crossing_with_dense_inputs(self):
- layer = category_crossing.CategoryCrossing()
- inputs_0 = np.asarray([[1, 2]])
- inputs_1 = np.asarray([[1, 3]])
- output = layer([inputs_0, inputs_1])
- self.assertAllEqual([[b'1_X_1', b'1_X_3', b'2_X_1', b'2_X_3']], output)
-
- def test_crossing_with_list_inputs(self):
- layer = category_crossing.CategoryCrossing()
- inputs_0 = [[1, 2]]
- inputs_1 = [[1, 3]]
- output = layer([inputs_0, inputs_1])
- self.assertAllEqual([[b'1_X_1', b'1_X_3', b'2_X_1', b'2_X_3']], output)
-
- inputs_0 = [1, 2]
- inputs_1 = [1, 3]
- output = layer([inputs_0, inputs_1])
- self.assertAllEqual([[b'1_X_1'], [b'2_X_3']], output)
-
- inputs_0 = np.asarray([1, 2])
- inputs_1 = np.asarray([1, 3])
- output = layer([inputs_0, inputs_1])
- self.assertAllEqual([[b'1_X_1'], [b'2_X_3']], output)
-
- def test_crossing_dense_inputs_depth_int(self):
- layer = category_crossing.CategoryCrossing(depth=1)
- inputs_0 = constant_op.constant([['a'], ['b'], ['c']])
- inputs_1 = constant_op.constant([['d'], ['e'], ['f']])
- output = layer([inputs_0, inputs_1])
- expected_output = [[b'a', b'd'], [b'b', b'e'], [b'c', b'f']]
- self.assertAllEqual(expected_output, output)
-
- layer = category_crossing.CategoryCrossing(depth=2)
- inp_0_t = input_layer.Input(shape=(1,), dtype=dtypes.string)
- inp_1_t = input_layer.Input(shape=(1,), dtype=dtypes.string)
- out_t = layer([inp_0_t, inp_1_t])
- model = training.Model([inp_0_t, inp_1_t], out_t)
- crossed_output = [[b'a_X_d'], [b'b_X_e'], [b'c_X_f']]
- expected_output = array_ops.concat([expected_output, crossed_output],
- axis=1)
- self.assertAllEqual(expected_output, model.predict([inputs_0, inputs_1]))
-
- def test_crossing_dense_inputs_depth_tuple(self):
- layer = category_crossing.CategoryCrossing(depth=[2, 3])
- inputs_0 = constant_op.constant([['a'], ['b'], ['c']])
- inputs_1 = constant_op.constant([['d'], ['e'], ['f']])
- inputs_2 = constant_op.constant([['g'], ['h'], ['i']])
- inp_0_t = input_layer.Input(shape=(1,), dtype=dtypes.string)
- inp_1_t = input_layer.Input(shape=(1,), dtype=dtypes.string)
- inp_2_t = input_layer.Input(shape=(1,), dtype=dtypes.string)
- out_t = layer([inp_0_t, inp_1_t, inp_2_t])
- model = training.Model([inp_0_t, inp_1_t, inp_2_t], out_t)
- expected_outputs_0 = [[b'a_X_d', b'a_X_g', b'd_X_g', b'a_X_d_X_g']]
- expected_outputs_1 = [[b'b_X_e', b'b_X_h', b'e_X_h', b'b_X_e_X_h']]
- expected_outputs_2 = [[b'c_X_f', b'c_X_i', b'f_X_i', b'c_X_f_X_i']]
- expected_output = array_ops.concat(
- [expected_outputs_0, expected_outputs_1, expected_outputs_2], axis=0)
- self.assertAllEqual(expected_output,
- model.predict([inputs_0, inputs_1, inputs_2]))
-
- def test_crossing_compute_output_signature(self):
- input_shapes = [
- tensor_shape.TensorShape([2, 2]),
- tensor_shape.TensorShape([2, 3])
- ]
- input_specs = [
- tensor_spec.TensorSpec(input_shape, dtypes.string)
- for input_shape in input_shapes
- ]
- layer = category_crossing.CategoryCrossing()
- output_spec = layer.compute_output_signature(input_specs)
- self.assertEqual(output_spec.shape.dims[0], input_shapes[0].dims[0])
- self.assertEqual(output_spec.dtype, dtypes.string)
-
- @testing_utils.run_v2_only
- def test_config_with_custom_name(self):
- layer = category_crossing.CategoryCrossing(depth=2, name='hashing')
- config = layer.get_config()
- layer_1 = category_crossing.CategoryCrossing.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
- layer = category_crossing.CategoryCrossing(name='hashing')
- config = layer.get_config()
- layer_1 = category_crossing.CategoryCrossing.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-if __name__ == '__main__':
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding.py b/tensorflow/python/keras/layers/preprocessing/category_encoding.py
deleted file mode 100644
index fde5927..0000000
--- a/tensorflow/python/keras/layers/preprocessing/category_encoding.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras CategoryEncoding preprocessing layer."""
-# pylint: disable=g-classes-have-attributes
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.keras import backend
-from tensorflow.python.keras.engine import base_layer
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import bincount_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util.tf_export import keras_export
-
-INT = "int"
-ONE_HOT = "one_hot"
-MULTI_HOT = "multi_hot"
-COUNT = "count"
-
-
-@keras_export("keras.layers.experimental.preprocessing.CategoryEncoding")
-class CategoryEncoding(base_layer.Layer):
- """Category encoding layer.
-
- This layer provides options for condensing data into a categorical encoding
- when the total number of tokens are known in advance. It accepts integer
- values as inputs and outputs a dense representation (one sample = 1-index
- tensor of float values representing data about the sample's tokens) of those
- inputs. For integer inputs where the total number of tokens is not known, see
- `tf.keras.layers.experimental.preprocessing.IntegerLookup`.
-
- Examples:
-
- **One-hot encoding data**
-
- >>> layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding(
- ... num_tokens=4, output_mode="one_hot")
- >>> layer([3, 2, 0, 1])
- <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
- array([[0., 0., 0., 1.],
- [0., 0., 1., 0.],
- [1., 0., 0., 0.],
- [0., 1., 0., 0.]], dtype=float32)>
-
- **Multi-hot encoding data**
-
- >>> layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding(
- ... num_tokens=4, output_mode="multi_hot")
- >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]])
- <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
- array([[1., 1., 0., 0.],
- [1., 0., 0., 0.],
- [0., 1., 1., 0.],
- [0., 1., 0., 1.]], dtype=float32)>
-
- **Using weighted inputs in `"count"` mode**
-
- >>> layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding(
- ... num_tokens=4, output_mode="count")
- >>> count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]])
- >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights)
- <tf.Tensor: shape=(4, 4), dtype=float64, numpy=
- array([[0.1, 0.2, 0. , 0. ],
- [0.2, 0. , 0. , 0. ],
- [0. , 0.2, 0.3, 0. ],
- [0. , 0.2, 0. , 0.4]])>
-
- Args:
- num_tokens: The total number of tokens the layer should support. All inputs
- to the layer must integers in the range 0 <= value < num_tokens or an
- error will be thrown.
- output_mode: Specification for the output of the layer.
- Defaults to `"multi_hot"`. Values can be `"one_hot"`, `"multi_hot"` or
- `"count"`, configuring the layer as follows:
- - `"one_hot"`: Encodes each individual element in the input into an
- array of `num_tokens` size, containing a 1 at the element index. If
- the last dimension is size 1, will encode on that dimension. If the
- last dimension is not size 1, will append a new dimension for the
- encoded output.
- - `"multi_hot"`: Encodes each sample in the input into a single array
- of `num_tokens` size, containing a 1 for each vocabulary term present
- in the sample. Treats the last dimension as the sample dimension, if
- input shape is (..., sample_length), output shape will be
- (..., num_tokens).
- - `"count"`: As `"multi_hot"`, but the int array contains a count of the
- number of times the token at that index appeared in the sample.
- sparse: Boolean. If true, returns a `SparseTensor` instead of a dense
- `Tensor`. Defaults to `False`.
-
- Call arguments:
- inputs: A 2D tensor `(samples, timesteps)`.
- count_weights: A 2D tensor in the same shape as `inputs` indicating the
- weight for each sample value when summing up in `count` mode. Not used in
- `"multi_hot"` mode.
- """
-
- def __init__(self,
- num_tokens=None,
- output_mode=MULTI_HOT,
- sparse=False,
- **kwargs):
- # max_tokens is an old name for the num_tokens arg we continue to support
- # because of usage.
- if "max_tokens" in kwargs:
- logging.warning(
- "max_tokens is deprecated, please use num_tokens instead.")
- num_tokens = kwargs["max_tokens"]
- del kwargs["max_tokens"]
-
- super(CategoryEncoding, self).__init__(**kwargs)
-
- # Support deprecated names for output_modes.
- if output_mode == "binary":
- output_mode = MULTI_HOT
- # 'output_mode' must be one of (COUNT, ONE_HOT, MULTI_HOT)
- layer_utils.validate_string_arg(
- output_mode,
- allowable_strings=(COUNT, ONE_HOT, MULTI_HOT),
- layer_name="CategoryEncoding",
- arg_name="output_mode")
-
- if num_tokens is None:
- raise ValueError("num_tokens must be set to use this layer. If the "
- "number of tokens is not known beforehand, use the "
- "IntegerLookup layer instead.")
- if num_tokens < 1:
- raise ValueError("num_tokens must be >= 1.")
-
- self.num_tokens = num_tokens
- self.output_mode = output_mode
- self.sparse = sparse
-
- def compute_output_shape(self, input_shape):
- if not input_shape:
- return tensor_shape.TensorShape([self.num_tokens])
- if self.output_mode == ONE_HOT and input_shape[-1] != 1:
- return tensor_shape.TensorShape(input_shape + [self.num_tokens])
- else:
- return tensor_shape.TensorShape(input_shape[:-1] + [self.num_tokens])
-
- def compute_output_signature(self, input_spec):
- output_shape = self.compute_output_shape(input_spec.shape.as_list())
- if self.sparse:
- return sparse_tensor.SparseTensorSpec(
- shape=output_shape, dtype=dtypes.int64)
- else:
- return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.int64)
-
- def get_config(self):
- config = {
- "num_tokens": self.num_tokens,
- "output_mode": self.output_mode,
- "sparse": self.sparse,
- }
- base_config = super(CategoryEncoding, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
- def call(self, inputs, count_weights=None):
- if isinstance(inputs, (list, np.ndarray)):
- inputs = ops.convert_to_tensor_v2_with_dispatch(inputs)
-
- def expand_dims(inputs, axis):
- if tf_utils.is_sparse(inputs):
- return sparse_ops.sparse_expand_dims(inputs, axis)
- else:
- return array_ops.expand_dims(inputs, axis)
-
- original_shape = inputs.shape
- # In all cases, we should uprank scalar input to a single sample.
- if inputs.shape.rank == 0:
- inputs = expand_dims(inputs, -1)
- # One hot will uprank only if the final output dimension is not already 1.
- if self.output_mode == ONE_HOT:
- if inputs.shape[-1] != 1:
- inputs = expand_dims(inputs, -1)
-
- # TODO(b/190445202): remove output rank restriction.
- if inputs.shape.rank > 2:
- raise ValueError(
- "Received input shape {}, which would result in output rank {}. "
- "Currently only outputs up to rank 2 are supported.".format(
- original_shape, inputs.shape.rank))
-
- if count_weights is not None and self.output_mode != COUNT:
- raise ValueError(
- "`count_weights` is not used when `output_mode` is not `'count'`. "
- "Received `count_weights={}`.".format(count_weights))
-
- out_depth = self.num_tokens
- binary_output = self.output_mode in (MULTI_HOT, ONE_HOT)
- if isinstance(inputs, sparse_tensor.SparseTensor):
- max_value = math_ops.reduce_max(inputs.values)
- min_value = math_ops.reduce_min(inputs.values)
- else:
- max_value = math_ops.reduce_max(inputs)
- min_value = math_ops.reduce_min(inputs)
- condition = math_ops.logical_and(
- math_ops.greater(
- math_ops.cast(out_depth, max_value.dtype), max_value),
- math_ops.greater_equal(
- min_value, math_ops.cast(0, min_value.dtype)))
- assertion = control_flow_ops.Assert(condition, [
- "Input values must be in the range 0 <= values < num_tokens"
- " with num_tokens={}".format(out_depth)
- ])
- with ops.control_dependencies([assertion]):
- if self.sparse:
- return sparse_bincount(inputs, out_depth, binary_output,
- count_weights)
- else:
- return dense_bincount(inputs, out_depth, binary_output,
- count_weights)
-
-
-def sparse_bincount(inputs, out_depth, binary_output, count_weights=None):
- """Apply binary or count encoding to an input and return a sparse tensor."""
- result = bincount_ops.sparse_bincount(
- inputs,
- weights=count_weights,
- minlength=out_depth,
- maxlength=out_depth,
- axis=-1,
- binary_output=binary_output)
- if inputs.shape.rank == 1:
- output_shape = (out_depth,)
- else:
- result = math_ops.cast(result, backend.floatx())
- batch_size = array_ops.shape(result)[0]
- output_shape = (batch_size, out_depth)
- result = sparse_tensor.SparseTensor(
- indices=result.indices,
- values=result.values,
- dense_shape=output_shape)
- return result
-
-
-def dense_bincount(inputs, out_depth, binary_output, count_weights=None):
- """Apply binary or count encoding to an input."""
- result = bincount_ops.bincount(
- inputs,
- weights=count_weights,
- minlength=out_depth,
- maxlength=out_depth,
- dtype=backend.floatx(),
- axis=-1,
- binary_output=binary_output)
- if inputs.shape.rank == 1:
- result.set_shape(tensor_shape.TensorShape((out_depth,)))
- else:
- batch_size = inputs.shape.as_list()[0]
- result.set_shape(tensor_shape.TensorShape((batch_size, out_depth)))
- return result
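For reference, the encoding behavior removed here remains reachable through the public Keras API. A minimal sketch, assuming a TF 2.x release that still exports `tf.keras.layers.experimental.preprocessing.CategoryEncoding` (newer releases expose it as `tf.keras.layers.CategoryEncoding`):

```python
import numpy as np
import tensorflow as tf

# Multi-hot: one row of length num_tokens per sample, 1 where a token occurs.
layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding(
    num_tokens=6, output_mode="multi_hot")
print(layer(np.array([[1, 2, 3, 1], [0, 3, 1, 0]])))
# [[0. 1. 1. 1. 0. 0.]
#  [1. 1. 0. 1. 0. 0.]]

# Count: same layout, but occurrences are tallied instead of clipped to 1.
count_layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding(
    num_tokens=6, output_mode="count")
print(count_layer(np.array([[1, 2, 3, 1], [0, 3, 1, 0]])))
# [[0. 2. 1. 1. 0. 0.]
#  [2. 1. 0. 1. 0. 0.]]
```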
diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py
deleted file mode 100644
index e4e0d12..0000000
--- a/tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Distribution tests for keras.layers.preprocessing.category_encoding."""
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import combinations as ds_combinations
-from tensorflow.python.distribute import multi_process_runner
-from tensorflow.python.framework import config
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_combinations as combinations
-from tensorflow.python.keras import backend
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.distribute import strategy_combinations
-from tensorflow.python.keras.layers.preprocessing import category_encoding
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-
-
-def batch_wrapper(dataset, batch_size, strategy, repeat=None):
- if repeat:
- dataset = dataset.repeat(repeat)
- # TPUs currently require fully defined input shapes, drop_remainder ensures
- # the input will have fully defined shapes.
- if backend.is_tpu_strategy(strategy):
- return dataset.batch(batch_size, drop_remainder=True)
- else:
- return dataset.batch(batch_size)
-
-
-@ds_combinations.generate(
- combinations.combine(
- # (b/156783625): Outside compilation failed for eager mode only.
- strategy=strategy_combinations.strategies_minus_tpu +
- strategy_combinations.multi_worker_mirrored_strategies,
- mode=["eager", "graph"]))
-class CategoryEncodingDistributionTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_strategy(self, strategy):
- input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])
- inp_dataset = dataset_ops.DatasetV2.from_tensor_slices(input_array)
- inp_dataset = batch_wrapper(inp_dataset, 2, strategy)
-
- # pyformat: disable
- expected_output = [[0, 1, 1, 1, 0, 0],
- [1, 1, 0, 1, 0, 0]]
- # pyformat: enable
- num_tokens = 6
- config.set_soft_device_placement(True)
-
- with strategy.scope():
- input_data = keras.Input(shape=(4,), dtype=dtypes.int32)
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(inp_dataset)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-if __name__ == "__main__":
- v2_compat.enable_v2_behavior()
- multi_process_runner.test_main()
diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py
deleted file mode 100644
index f955ee4..0000000
--- a/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py
+++ /dev/null
@@ -1,505 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Keras text category_encoding preprocessing layer."""
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python import keras
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.keras import backend
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.layers import core
-from tensorflow.python.keras.layers.preprocessing import category_encoding
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.platform import test
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoryEncodingInputTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest
- ):
-
- def test_dense_input_sparse_output(self):
- input_array = constant_op.constant([[1, 2, 3], [3, 3, 0]])
-
- # The expected output should be (X for missing value):
- # [[X, 1, 1, 1, X, X]
- # [1, X, X, 2, X, X]]
- expected_indices = [[0, 1], [0, 2], [0, 3], [1, 0], [1, 3]]
- expected_values = [1, 1, 1, 1, 2]
- num_tokens = 6
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int32)
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True)
- int_data = layer(input_data)
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- sp_output_dataset = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_values, sp_output_dataset.values)
- self.assertAllEqual(expected_indices, sp_output_dataset.indices)
-
- # Assert sparse output is same as dense output.
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens,
- output_mode=category_encoding.COUNT,
- sparse=False)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array, steps=1)
- self.assertAllEqual(
- sparse_ops.sparse_tensor_to_dense(sp_output_dataset, default_value=0),
- output_dataset)
-
- def test_sparse_input(self):
- input_array = np.array([[1, 2, 3, 0], [0, 3, 1, 0]], dtype=np.int64)
- sparse_tensor_data = sparse_ops.from_dense(input_array)
-
- # pyformat: disable
- expected_output = [[0, 1, 1, 1, 0, 0],
- [0, 1, 0, 1, 0, 0]]
- # pyformat: enable
- num_tokens = 6
- expected_output_shape = [None, num_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
-
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(sparse_tensor_data, steps=1)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_sparse_input_with_weights(self):
- input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 4]], dtype=np.int64)
- weights_array = np.array([[.1, .2, .3, .4], [.2, .1, .4, .3]])
- sparse_tensor_data = sparse_ops.from_dense(input_array)
- sparse_weight_data = sparse_ops.from_dense(weights_array)
-
- # pyformat: disable
- expected_output = [[0, .1, .2, .3, .4, 0],
- [0, .4, 0, .1, .5, 0]]
- # pyformat: enable
- num_tokens = 6
- expected_output_shape = [None, num_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
- weight_data = keras.Input(shape=(None,), dtype=dtypes.float32, sparse=True)
-
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.COUNT)
- int_data = layer(input_data, count_weights=weight_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=[input_data, weight_data], outputs=int_data)
- output_dataset = model.predict([sparse_tensor_data, sparse_weight_data],
- steps=1)
- self.assertAllClose(expected_output, output_dataset)
-
- def test_sparse_input_sparse_output(self):
- sp_inp = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]],
- values=[0, 2, 1, 1, 0],
- dense_shape=[4, 2])
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
-
- # The expected output should be (X for missing value):
- # [[1, X, X, X]
- # [X, X, 1, X]
- # [X, 2, X, X]
- # [1, X, X, X]]
- expected_indices = [[0, 0], [1, 2], [2, 1], [3, 0]]
- expected_values = [1, 1, 2, 1]
- num_tokens = 6
-
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True)
- int_data = layer(input_data)
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- sp_output_dataset = model.predict(sp_inp, steps=1)
- self.assertAllEqual(expected_values, sp_output_dataset.values)
- self.assertAllEqual(expected_indices, sp_output_dataset.indices)
-
- # Assert sparse output is same as dense output.
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens,
- output_mode=category_encoding.COUNT,
- sparse=False)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(sp_inp, steps=1)
- self.assertAllEqual(
- sparse_ops.sparse_tensor_to_dense(sp_output_dataset, default_value=0),
- output_dataset)
-
- def test_sparse_input_sparse_output_with_weights(self):
- indices = [[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]]
- sp_inp = sparse_tensor.SparseTensor(
- indices=indices, values=[0, 2, 1, 1, 0], dense_shape=[4, 2])
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
- sp_weight = sparse_tensor.SparseTensor(
- indices=indices, values=[.1, .2, .4, .3, .2], dense_shape=[4, 2])
- weight_data = keras.Input(shape=(None,), dtype=dtypes.float32, sparse=True)
-
- # The expected output should be (X for missing value):
- # [[1, X, X, X]
- # [X, X, 1, X]
- # [X, 2, X, X]
- # [1, X, X, X]]
- expected_indices = [[0, 0], [1, 2], [2, 1], [3, 0]]
- expected_values = [.1, .2, .7, .2]
- num_tokens = 6
-
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True)
- int_data = layer(input_data, count_weights=weight_data)
-
- model = keras.Model(inputs=[input_data, weight_data], outputs=int_data)
- sp_output_dataset = model.predict([sp_inp, sp_weight], steps=1)
- self.assertAllClose(expected_values, sp_output_dataset.values)
- self.assertAllEqual(expected_indices, sp_output_dataset.indices)
-
- def test_ragged_input(self):
- input_array = ragged_factory_ops.constant([[1, 2, 3], [3, 1]])
-
- # pyformat: disable
- expected_output = [[0, 1, 1, 1, 0, 0],
- [0, 1, 0, 1, 0, 0]]
- # pyformat: enable
- num_tokens = 6
- expected_output_shape = [None, num_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True)
-
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT)
- int_data = layer(input_data)
-
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_ragged_input_sparse_output(self):
- input_array = ragged_factory_ops.constant([[1, 2, 3], [3, 3]])
-
- # The expected output should be (X for missing value):
- # [[X, 1, 1, 1]
- # [X, X, X, 2]]
- expected_indices = [[0, 1], [0, 2], [0, 3], [1, 3]]
- expected_values = [1, 1, 1, 2]
- num_tokens = 6
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True)
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True)
- int_data = layer(input_data)
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- sp_output_dataset = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_values, sp_output_dataset.values)
- self.assertAllEqual(expected_indices, sp_output_dataset.indices)
-
- # Assert sparse output is same as dense output.
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens,
- output_mode=category_encoding.COUNT,
- sparse=False)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array, steps=1)
- self.assertAllEqual(
- sparse_ops.sparse_tensor_to_dense(sp_output_dataset, default_value=0),
- output_dataset)
-
- def test_sparse_output_and_dense_layer(self):
- input_array = constant_op.constant([[1, 2, 3], [3, 3, 0]])
-
- num_tokens = 4
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int32)
- encoding_layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.COUNT, sparse=True)
- int_data = encoding_layer(input_data)
- dense_layer = keras.layers.Dense(units=1)
- output_data = dense_layer(int_data)
-
- model = keras.Model(inputs=input_data, outputs=output_data)
- _ = model.predict(input_array, steps=1)
-
- def test_dense_oov_input(self):
- valid_array = constant_op.constant([[0, 1, 2], [0, 1, 2]])
- invalid_array = constant_op.constant([[0, 1, 2], [2, 3, 1]])
- num_tokens = 3
- expected_output_shape = [None, num_tokens]
- encoder_layer = category_encoding.CategoryEncoding(num_tokens)
- input_data = keras.Input(shape=(3,), dtype=dtypes.int32)
- int_data = encoder_layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
- model = keras.Model(inputs=input_data, outputs=int_data)
- # Call predict once on valid input to compile a graph and test control flow.
- _ = model.predict(valid_array, steps=1)
- with self.assertRaisesRegex(
- errors.InvalidArgumentError,
- ".*must be in the range 0 <= values < num_tokens.*"):
- _ = model.predict(invalid_array, steps=1)
-
- def test_dense_negative(self):
- valid_array = constant_op.constant([[0, 1, 2], [0, 1, 2]])
- invalid_array = constant_op.constant([[1, 2, 0], [2, 2, -1]])
- num_tokens = 3
- expected_output_shape = [None, num_tokens]
- encoder_layer = category_encoding.CategoryEncoding(num_tokens)
- input_data = keras.Input(shape=(3,), dtype=dtypes.int32)
- int_data = encoder_layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
- model = keras.Model(inputs=input_data, outputs=int_data)
- # Call predict once on valid input to compile a graph and test control flow.
- _ = model.predict(valid_array, steps=1)
- with self.assertRaisesRegex(
- errors.InvalidArgumentError,
- ".*must be in the range 0 <= values < num_tokens.*"):
- _ = model.predict(invalid_array, steps=1)
-
- def test_legacy_max_tokens_arg(self):
- input_array = np.array([[1, 2, 3, 1]])
- expected_output = [[0, 1, 1, 1, 0, 0]]
- num_tokens = 6
- expected_output_shape = [None, num_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int32)
- layer = category_encoding.CategoryEncoding(
- max_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-@keras_parameterized.run_all_keras_modes
-class CategoryEncodingOutputTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest
- ):
-
- def test_one_hot_output(self):
- input_data = np.array([[3], [2], [0], [1]])
- expected_output = [
- [0, 0, 0, 1],
- [0, 0, 1, 0],
- [1, 0, 0, 0],
- [0, 1, 0, 0],
- ]
- num_tokens = 4
- expected_output_shape = [None, num_tokens]
-
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.ONE_HOT)
- inputs = keras.Input(shape=(1,), dtype=dtypes.int32)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
- output_dataset = model(input_data)
- self.assertAllEqual(expected_output_shape, outputs.shape.as_list())
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_one_hot_output_rank_one_input(self):
- input_data = np.array([3, 2, 0, 1])
- expected_output = [
- [0, 0, 0, 1],
- [0, 0, 1, 0],
- [1, 0, 0, 0],
- [0, 1, 0, 0],
- ]
- num_tokens = 4
- expected_output_shape = [None, num_tokens]
-
- # Test call on layer directly.
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.ONE_HOT)
- output_data = layer(input_data)
- self.assertAllEqual(expected_output, output_data)
-
- # Test call on model.
- inputs = keras.Input(shape=(1,), dtype=dtypes.int32)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
- output_data = model(input_data)
- self.assertAllEqual(expected_output_shape, outputs.shape.as_list())
- self.assertAllEqual(expected_output, output_data)
-
- def test_one_hot_output_rank_zero_input(self):
- input_data = np.array(3)
- expected_output = [0, 0, 0, 1]
- num_tokens = 4
- expected_output_shape = [None, num_tokens]
-
- # Test call on layer directly.
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.ONE_HOT)
- output_data = layer(input_data)
- self.assertAllEqual(expected_output, output_data)
-
- # Test call on model.
- inputs = keras.Input(shape=(1,), dtype=dtypes.int32)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
- output_data = model(input_data)
-
- self.assertAllEqual(expected_output_shape, outputs.shape.as_list())
- self.assertAllEqual(expected_output, output_data)
-
- def test_one_hot_rank_3_output_fails(self):
- layer = category_encoding.CategoryEncoding(
- num_tokens=4, output_mode=category_encoding.ONE_HOT)
- with self.assertRaisesRegex(ValueError, "only outputs up to rank 2"):
- _ = layer(keras.Input(shape=(4,), dtype=dtypes.int32))
- with self.assertRaisesRegex(ValueError, "only outputs up to rank 2"):
- _ = layer(np.array([[3, 2, 0, 1], [3, 2, 0, 1]]))
-
- def test_multi_hot_output(self):
- input_data = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])
- expected_output = [
- [0, 1, 1, 1, 0, 0],
- [1, 1, 0, 1, 0, 0],
- ]
- num_tokens = 6
- expected_output_shape = [None, num_tokens]
-
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT)
- inputs = keras.Input(shape=(None,), dtype=dtypes.int32)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
- output_data = model.predict(input_data)
- self.assertAllEqual(expected_output_shape, outputs.shape.as_list())
- self.assertAllEqual(expected_output, output_data)
-
- def test_multi_hot_output_rank_one_input(self):
- input_data = np.array([3, 2, 0, 1])
- expected_output = [1, 1, 1, 1, 0, 0]
- num_tokens = 6
- expected_output_shape = [None, num_tokens]
-
- # Test call on layer directly.
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT)
- output_data = layer(input_data)
- self.assertAllEqual(expected_output, output_data)
-
- # Test call on model.
- inputs = keras.Input(shape=(4,), dtype=dtypes.int32)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
- output_data = model(input_data)
- self.assertAllEqual(expected_output_shape, outputs.shape.as_list())
- self.assertAllEqual(expected_output, output_data)
-
- def test_multi_hot_output_rank_zero_input(self):
- input_data = np.array(3)
- expected_output = [0, 0, 0, 1, 0, 0]
- num_tokens = 6
- expected_output_shape = [None, num_tokens]
-
- # Test call on layer directly.
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=category_encoding.MULTI_HOT)
- output_data = layer(input_data)
- self.assertAllEqual(expected_output, output_data)
-
- # Test call on model.
- inputs = keras.Input(shape=(4,), dtype=dtypes.int32)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
- output_data = model(input_data)
- self.assertAllEqual(expected_output_shape, outputs.shape.as_list())
- self.assertAllEqual(expected_output, output_data)
-
- def test_multi_hot_rank_3_output_fails(self):
- layer = category_encoding.CategoryEncoding(
- num_tokens=4, output_mode=category_encoding.ONE_HOT)
- with self.assertRaisesRegex(ValueError, "only outputs up to rank 2"):
- _ = layer(keras.Input(shape=(3, 4,), dtype=dtypes.int32))
- with self.assertRaisesRegex(ValueError, "only outputs up to rank 2"):
- _ = layer(np.array([[[3, 2, 0, 1], [3, 2, 0, 1]]]))
-
- def test_count_output(self):
- input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])
-
- # pyformat: disable
- expected_output = [[0, 2, 1, 1, 0, 0],
- [2, 1, 0, 1, 0, 0]]
- # pyformat: enable
- num_tokens = 6
- expected_output_shape = [None, num_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int32)
- layer = category_encoding.CategoryEncoding(
- num_tokens=6, output_mode=category_encoding.COUNT)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-class CategoryEncodingModelBuildingTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- @parameterized.named_parameters(
- {
- "testcase_name": "count_output",
- "num_tokens": 5,
- "output_mode": category_encoding.COUNT
- }, {
- "testcase_name": "multi_hot_output",
- "num_tokens": 5,
- "output_mode": category_encoding.MULTI_HOT
- })
- def test_end_to_end_bagged_modeling(self, output_mode, num_tokens):
- input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int32)
- layer = category_encoding.CategoryEncoding(
- num_tokens=num_tokens, output_mode=output_mode)
-
- weights = []
- if num_tokens is None:
- layer.set_num_elements(5)
- layer.set_weights(weights)
-
- int_data = layer(input_data)
- float_data = backend.cast(int_data, dtype="float32")
- output_data = core.Dense(64)(float_data)
- model = keras.Model(inputs=input_data, outputs=output_data)
- _ = model.predict(input_array)
-
-
-if __name__ == "__main__":
- test.main()
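The weighted-count cases in the deleted tests above reduce to a per-sample weighted bincount over token indices. A small NumPy sketch of that bookkeeping (illustrative only; the layer itself goes through TF's `bincount_ops` as shown in the deleted layer code):

```python
import numpy as np

# One sample from test_sparse_input_with_weights: tokens [1, 2, 3, 4]
# with weights [.1, .2, .3, .4] summed into a length-6 vector.
tokens = np.array([1, 2, 3, 4])
weights = np.array([0.1, 0.2, 0.3, 0.4])
print(np.bincount(tokens, weights=weights, minlength=6))
# [0.  0.1 0.2 0.3 0.4 0. ]

# Second sample: token 4 appears twice, so its weights (.2 and .3) accumulate.
print(np.bincount(np.array([4, 3, 1, 4]),
                  weights=np.array([0.2, 0.1, 0.4, 0.3]), minlength=6))
# [0.  0.4 0.  0.1 0.5 0. ]
```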
diff --git a/tensorflow/python/keras/layers/preprocessing/discretization.py b/tensorflow/python/keras/layers/preprocessing/discretization.py
deleted file mode 100644
index aaaaa51..0000000
--- a/tensorflow/python/keras/layers/preprocessing/discretization.py
+++ /dev/null
@@ -1,316 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras discretization preprocessing layer."""
-# pylint: disable=g-classes-have-attributes
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras.engine import base_preprocessing_layer
-from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import sort_ops
-from tensorflow.python.ops.ragged import ragged_functional_ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util.tf_export import keras_export
-
-
-def summarize(values, epsilon):
- """Reduce a 1D sequence of values to a summary.
-
- This algorithm is based on numpy.quantile but modified to allow for
- intermediate steps between multiple data sets. It first finds the target
- number of bins as the reciprocal of epsilon and then takes the individual
- values spaced at appropriate intervals to arrive at that target.
- The final step is to return the corresponding counts between those values.
- If the target num_bins is larger than the size of values, the whole array is
- returned (with weights of 1).
-
- Args:
- values: 1-D `np.ndarray` to be summarized.
- epsilon: A `'float32'` that determines the approximate desired precision.
-
- Returns:
- A 2-D `np.ndarray` that is a summary of the inputs. First column is the
- interpolated partition values, the second is the weights (counts).
- """
-
- values = array_ops.reshape(values, [-1])
- values = sort_ops.sort(values)
- elements = math_ops.cast(array_ops.size(values), dtypes.float32)
- num_buckets = 1. / epsilon
- increment = math_ops.cast(elements / num_buckets, dtypes.int32)
- start = increment
- step = math_ops.maximum(increment, 1)
- boundaries = values[start::step]
- weights = array_ops.ones_like(boundaries)
- weights = weights * math_ops.cast(step, dtypes.float32)
- return array_ops.stack([boundaries, weights])
-
-
-def compress(summary, epsilon):
- """Compress a summary to within `epsilon` accuracy.
-
- The compression step is needed to keep the summary sizes small after merging,
- and also used to return the final target boundaries. It finds the new bins
- based on interpolating cumulative weight percentages from the large summary.
- Taking the difference of the cumulative weights from the previous bin's
- cumulative weight will give the new weight for that bin.
-
- Args:
- summary: 2-D `np.ndarray` summary to be compressed.
- epsilon: A `'float32'` that determines the approximate desired precision.
-
- Returns:
- A 2-D `np.ndarray` that is a compressed summary. First column is the
- interpolated partition values, the second is the weights (counts).
- """
- # TODO(b/184863356): remove the numpy escape hatch here.
- return script_ops.numpy_function(
- lambda s: _compress_summary_numpy(s, epsilon), [summary], dtypes.float32)
-
-
-def _compress_summary_numpy(summary, epsilon):
- """Compress a summary with numpy."""
- if summary.shape[1] * epsilon < 1:
- return summary
-
- percents = epsilon + np.arange(0.0, 1.0, epsilon)
- cum_weights = summary[1].cumsum()
- cum_weight_percents = cum_weights / cum_weights[-1]
- new_bins = np.interp(percents, cum_weight_percents, summary[0])
- cum_weights = np.interp(percents, cum_weight_percents, cum_weights)
- new_weights = cum_weights - np.concatenate((np.array([0]), cum_weights[:-1]))
- summary = np.stack((new_bins, new_weights))
- return summary.astype(np.float32)
-
-
-def merge_summaries(prev_summary, next_summary, epsilon):
- """Weighted merge sort of summaries.
-
- Given two summaries of distinct data, this function merges (and compresses)
- them to stay within `epsilon` error tolerance.
-
- Args:
- prev_summary: 2-D `np.ndarray` summary to be merged with `next_summary`.
- next_summary: 2-D `np.ndarray` summary to be merged with `prev_summary`.
- epsilon: A float that determines the approximate desired precision.
-
- Returns:
- A 2-D `np.ndarray` that is a merged summary. First column is the
- interpolated partition values, the second is the weights (counts).
- """
- merged = array_ops.concat((prev_summary, next_summary), axis=1)
- merged = array_ops.gather_v2(merged, sort_ops.argsort(merged[0]), axis=1)
- return compress(merged, epsilon)
-
-
-def get_bin_boundaries(summary, num_bins):
- return compress(summary, 1.0 / num_bins)[0, :-1]
-
-
-@keras_export("keras.layers.experimental.preprocessing.Discretization")
-class Discretization(base_preprocessing_layer.PreprocessingLayer):
- """Buckets data into discrete ranges.
-
- This layer will place each element of its input data into one of several
- contiguous ranges and output an integer index indicating which range each
- element was placed in.
-
- Input shape:
- Any `tf.Tensor` or `tf.RaggedTensor` of dimension 2 or higher.
-
- Output shape:
- Same as input shape.
-
- Attributes:
- bin_boundaries: A list of bin boundaries. The leftmost and rightmost bins
- will always extend to `-inf` and `inf`, so `bin_boundaries=[0., 1., 2.]`
- generates bins `(-inf, 0.)`, `[0., 1.)`, `[1., 2.)`, and `[2., +inf)`. If
- this option is set, `adapt` should not be called.
- num_bins: The integer number of bins to compute. If this option is set,
- `adapt` should be called to learn the bin boundaries.
- epsilon: Error tolerance, typically a small fraction close to zero (e.g.
- 0.01). Higher values of epsilon coarsen the quantile approximation, and
- hence can result in more unequal buckets, but improve performance
- and reduce resource consumption.
-
- Examples:
-
- Bucketize float values based on provided buckets.
- >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
- >>> layer = tf.keras.layers.experimental.preprocessing.Discretization(
- ... bin_boundaries=[0., 1., 2.])
- >>> layer(input)
- <tf.Tensor: shape=(2, 4), dtype=int64, numpy=
- array([[0, 2, 3, 1],
- [1, 3, 2, 1]])>
-
- Bucketize float values based on a number of buckets to compute.
- >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
- >>> layer = tf.keras.layers.experimental.preprocessing.Discretization(
- ... num_bins=4, epsilon=0.01)
- >>> layer.adapt(input)
- >>> layer(input)
- <tf.Tensor: shape=(2, 4), dtype=int64, numpy=
- array([[0, 2, 3, 2],
- [1, 3, 3, 1]])>
- """
-
- def __init__(self,
- bin_boundaries=None,
- num_bins=None,
- epsilon=0.01,
- **kwargs):
- # bins is a deprecated arg for setting bin_boundaries or num_bins that still
- # has some usage.
- if "bins" in kwargs:
- logging.warning(
- "bins is deprecated, please use bin_boundaries or num_bins instead.")
- if isinstance(kwargs["bins"], int) and num_bins is None:
- num_bins = kwargs["bins"]
- elif bin_boundaries is None:
- bin_boundaries = kwargs["bins"]
- del kwargs["bins"]
- super().__init__(streaming=True, **kwargs)
- if num_bins is not None and num_bins < 0:
- raise ValueError("`num_bins` must be greater than or equal to 0. "
- "You passed `num_bins={}`".format(num_bins))
- if num_bins is not None and bin_boundaries is not None:
- raise ValueError("Both `num_bins` and `bin_boundaries` should not be "
- "set. You passed `num_bins={}` and "
- "`bin_boundaries={}`".format(num_bins, bin_boundaries))
- bin_boundaries = self._convert_to_list(bin_boundaries)
- self.input_bin_boundaries = bin_boundaries
- self.bin_boundaries = bin_boundaries if bin_boundaries is not None else []
- self.num_bins = num_bins
- self.epsilon = epsilon
-
- def build(self, input_shape):
- super().build(input_shape)
-
- if self.input_bin_boundaries is not None:
- return
-
- # Summary contains two equal length vectors of bins at index 0 and weights
- # at index 1.
- self.summary = self.add_weight(
- name="summary",
- shape=(2, None),
- dtype=dtypes.float32,
- initializer=lambda shape, dtype: [[], []], # pylint: disable=unused-arguments
- trainable=False)
-
- def update_state(self, data):
- if self.input_bin_boundaries is not None:
- raise ValueError(
- "Cannot adapt a Discretization layer that has been initialized with "
- "`bin_boundaries`, use `num_bins` instead. You passed "
- "`bin_boundaries={}`.".format(self.input_bin_boundaries))
-
- if not self.built:
- raise RuntimeError("`build` must be called before `update_state`.")
-
- data = ops.convert_to_tensor_v2_with_dispatch(data)
- if data.dtype != dtypes.float32:
- data = math_ops.cast(data, dtypes.float32)
- summary = summarize(data, self.epsilon)
- self.summary.assign(merge_summaries(summary, self.summary, self.epsilon))
-
- def merge_state(self, layers):
- for l in layers + [self]:
- if l.input_bin_boundaries is not None:
- raise ValueError(
- "Cannot merge Discretization layer {} that has been initialized "
- "with `bin_boundaries`, use `num_bins` instead. You passed "
- "`bin_boundaries={}`.".format(l.name, l.input_bin_boundaries))
- if not l.built:
- raise ValueError(
- "Cannot merge Discretization layer {}, it has no state. You need "
- "to call `adapt` on this layer before merging.".format(l.name))
-
- summary = self.summary
- for l in layers:
- summary = merge_summaries(summary, l.summary, self.epsilon)
- self.summary.assign(summary)
- self.finalize_state()
-
- def finalize_state(self):
- if self.input_bin_boundaries is not None or not self.built:
- return
-
- # The bucketize op only supports list boundaries.
- self.bin_boundaries = self._convert_to_list(
- get_bin_boundaries(self.summary, self.num_bins))
-
- def reset_state(self): # pylint: disable=method-hidden
- if self.input_bin_boundaries is not None or not self.built:
- return
-
- self.summary.assign([[], []])
-
- def get_config(self):
- config = super().get_config()
- config.update({
- "bin_boundaries": self.input_bin_boundaries,
- "num_bins": self.num_bins,
- "epsilon": self.epsilon,
- })
- return config
-
- def compute_output_shape(self, input_shape):
- return input_shape
-
- def compute_output_signature(self, input_spec):
- output_shape = self.compute_output_shape(input_spec.shape.as_list())
- output_dtype = dtypes.int64
- if isinstance(input_spec, sparse_tensor.SparseTensorSpec):
- return sparse_tensor.SparseTensorSpec(
- shape=output_shape, dtype=output_dtype)
- return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype)
-
- def call(self, inputs):
- def bucketize(inputs):
- return gen_math_ops.Bucketize(
- input=inputs, boundaries=self.bin_boundaries)
-
- if tf_utils.is_ragged(inputs):
- integer_buckets = ragged_functional_ops.map_flat_values(bucketize, inputs)
- # Ragged map_flat_values doesn't touch the non-values tensors in the
- # ragged composite tensor. If this op is the only op in a Keras model,
- # this can cause errors in Graph mode, so wrap the tensor in an identity.
- return array_ops.identity(integer_buckets)
- elif tf_utils.is_sparse(inputs):
- return sparse_tensor.SparseTensor(
- indices=array_ops.identity(inputs.indices),
- values=bucketize(inputs.values),
- dense_shape=array_ops.identity(inputs.dense_shape))
- else:
- return bucketize(inputs)
-
- def _convert_to_list(self, inputs):
- if tensor_util.is_tensor(inputs):
- inputs = inputs.numpy()
- if isinstance(inputs, (np.ndarray)):
- inputs = inputs.tolist()
- inputs = list(inputs)
- return inputs
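The `summarize`/`compress` pipeline removed above is a streaming approximation of exact quantiles: for uniformly spread data and `num_bins=3`, the boundaries should land near the 1/3 and 2/3 quantiles, which is what the adapt tests below expect. A NumPy sketch of the exact result being approximated (not the layer's streaming algorithm):

```python
import numpy as np

data = np.arange(300, dtype=np.float32)
num_bins = 3

# Exact quantile cuts that Discretization.adapt() approximates within epsilon.
boundaries = np.quantile(data, [i / num_bins for i in range(1, num_bins)])
print(boundaries)  # ~[ 99.67, 199.33]

# Bucketizing with those cuts splits the data into three near-equal bins.
print(np.bincount(np.digitize(data, boundaries)))  # ~[100, 100, 100]
```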
diff --git a/tensorflow/python/keras/layers/preprocessing/discretization_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/discretization_distribution_test.py
deleted file mode 100644
index a040ffc..0000000
--- a/tensorflow/python/keras/layers/preprocessing/discretization_distribution_test.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Distribution tests for keras.layers.preprocessing.discretization."""
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.distribute import combinations as ds_combinations
-from tensorflow.python.distribute import multi_process_runner
-from tensorflow.python.framework import config
-from tensorflow.python.framework import test_combinations as combinations
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.distribute import strategy_combinations
-from tensorflow.python.keras.layers.preprocessing import discretization
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-
-
-@ds_combinations.generate(
- combinations.combine(
- strategy=strategy_combinations.all_strategies +
- strategy_combinations.multi_worker_mirrored_strategies,
- mode=["eager", "graph"]))
-class DiscretizationDistributionTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_distribution(self, strategy):
- input_array = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
-
- expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]]
- expected_output_shape = [None, 4]
-
- config.set_soft_device_placement(True)
-
- with strategy.scope():
- input_data = keras.Input(shape=(4,))
- layer = discretization.Discretization(bin_boundaries=[0., 1., 2.])
- bucket_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=bucket_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-if __name__ == "__main__":
- v2_compat.enable_v2_behavior()
- multi_process_runner.test_main()
diff --git a/tensorflow/python/keras/layers/preprocessing/discretization_test.py b/tensorflow/python/keras/layers/preprocessing/discretization_test.py
deleted file mode 100644
index 9680dfb..0000000
--- a/tensorflow/python/keras/layers/preprocessing/discretization_test.py
+++ /dev/null
@@ -1,373 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Keras discretization preprocessing layer."""
-
-import os
-
-from absl.testing import parameterized
-
-import numpy as np
-
-from tensorflow.python import keras
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.layers.preprocessing import discretization
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import load
-from tensorflow.python.saved_model import save
-
-
-@keras_parameterized.run_all_keras_modes
-class DiscretizationTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_bucketize_with_explicit_buckets_integer(self):
- input_array = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
-
- expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]]
- expected_output_shape = [None, 4]
-
- input_data = keras.Input(shape=(4,))
- layer = discretization.Discretization(bin_boundaries=[0., 1., 2.])
- bucket_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=bucket_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_bucketize_with_explicit_buckets_int_input(self):
- input_array = np.array([[-1, 1, 3, 0], [0, 3, 1, 0]], dtype=np.int64)
-
- expected_output = [[0, 2, 3, 1], [1, 3, 2, 1]]
- expected_output_shape = [None, 4]
-
- input_data = keras.Input(shape=(4,), dtype=dtypes.int64)
- layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5])
- bucket_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=bucket_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_bucketize_with_explicit_buckets_sparse_float_input(self):
- indices = [[0, 1], [0, 2], [1, 1]]
- input_array = sparse_tensor.SparseTensor(
- indices=indices, values=[-1.5, 1.0, 3.4], dense_shape=[2, 3])
- expected_output = [0, 2, 3]
- input_data = keras.Input(shape=(3,), dtype=dtypes.float32, sparse=True)
- layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5])
- bucket_data = layer(input_data)
-
- model = keras.Model(inputs=input_data, outputs=bucket_data)
- output_dataset = model.predict(input_array, steps=1)
- self.assertAllEqual(indices, output_dataset.indices)
- self.assertAllEqual(expected_output, output_dataset.values)
-
- def test_bucketize_with_explicit_buckets_ragged_float_input(self):
- input_array = ragged_factory_ops.constant([[-1.5, 1.0, 3.4, .5],
- [0.0, 3.0, 1.3]])
-
- expected_output = [[0, 2, 3, 1], [1, 3, 2]]
- expected_output_shape = [None, None]
-
- input_data = keras.Input(shape=(None,), ragged=True)
- layer = discretization.Discretization(bin_boundaries=[0., 1., 2.])
- bucket_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=bucket_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_bucketize_with_explicit_buckets_ragged_int_input(self):
- input_array = ragged_factory_ops.constant([[-1, 1, 3, 0], [0, 3, 1]],
- dtype=dtypes.int64)
-
- expected_output = [[0, 2, 3, 1], [1, 3, 2]]
- expected_output_shape = [None, None]
-
- input_data = keras.Input(shape=(None,), ragged=True, dtype=dtypes.int64)
- layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5])
- bucket_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list())
- model = keras.Model(inputs=input_data, outputs=bucket_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_bucketize_with_explicit_buckets_sparse_int_input(self):
- indices = [[0, 1], [0, 2], [1, 1]]
- input_array = sparse_tensor.SparseTensor(
- indices=indices, values=[-1, 1, 3], dense_shape=[2, 3])
- expected_output = [0, 2, 3]
- input_data = keras.Input(shape=(3,), dtype=dtypes.int32, sparse=True)
- layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5])
- bucket_data = layer(input_data)
-
- model = keras.Model(inputs=input_data, outputs=bucket_data)
- output_dataset = model.predict(input_array, steps=1)
- self.assertAllEqual(indices, output_dataset.indices)
- self.assertAllEqual(expected_output, output_dataset.values)
-
- def test_output_shape(self):
- input_data = keras.Input(batch_size=16, shape=(4,), dtype=dtypes.int64)
- layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5])
- output = layer(input_data)
- self.assertAllEqual(output.shape.as_list(), [16, 4])
-
- def test_num_bins_negative_fails(self):
- with self.assertRaisesRegex(ValueError, "`num_bins` must be.*num_bins=-7"):
- _ = discretization.Discretization(num_bins=-7)
-
- def test_num_bins_and_bins_set_fails(self):
- with self.assertRaisesRegex(
- ValueError,
- r"`num_bins` and `bin_boundaries` should not be set.*5.*\[1, 2\]"):
- _ = discretization.Discretization(num_bins=5, bins=[1, 2])
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class DiscretizationAdaptTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- @parameterized.named_parameters([
- {
- "testcase_name": "2d_single_element",
- "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]]),
- "test_data": np.array([[1.], [2.], [3.]]),
- "use_dataset": True,
- "expected": np.array([[1], [2], [3]]),
- "num_bins": 5,
- "epsilon": 0.01
- }, {
- "testcase_name": "2d_multi_element",
- "adapt_data": np.array([[1., 6.], [2., 7.], [3., 8.], [4., 9.],
- [5., 10.]]),
- "test_data": np.array([[1., 10.], [2., 6.], [3., 8.]]),
- "use_dataset": True,
- "expected": np.array([[0, 4], [1, 3], [1, 4]]),
- "num_bins": 5,
- "epsilon": 0.01
- }, {
- "testcase_name": "1d_single_element",
- "adapt_data": np.array([3., 2., 1., 5., 4.]),
- "test_data": np.array([1., 2., 3.]),
- "use_dataset": True,
- "expected": np.array([1, 2, 3]),
- "num_bins": 5,
- "epsilon": 0.01
- }, {
- "testcase_name": "300_batch_1d_single_element_1",
- "adapt_data": np.arange(300),
- "test_data": np.arange(300),
- "use_dataset": True,
- "expected":
- np.concatenate([np.zeros(101), np.ones(99), 2 * np.ones(100)]),
- "num_bins": 3,
- "epsilon": 0.01
- }, {
- "testcase_name": "300_batch_1d_single_element_2",
- "adapt_data": np.arange(300) ** 2,
- "test_data": np.arange(300) ** 2,
- "use_dataset": True,
- "expected":
- np.concatenate([np.zeros(101), np.ones(99), 2 * np.ones(100)]),
- "num_bins": 3,
- "epsilon": 0.01
- }, {
- "testcase_name": "300_batch_1d_single_element_large_epsilon",
- "adapt_data": np.arange(300),
- "test_data": np.arange(300),
- "use_dataset": True,
- "expected": np.concatenate([np.zeros(136), np.ones(164)]),
- "num_bins": 2,
- "epsilon": 0.1
- }])
- def test_layer_computation(self, adapt_data, test_data, use_dataset,
- expected, num_bins=5, epsilon=0.01):
-
- input_shape = tuple(list(test_data.shape)[1:])
- np.random.shuffle(adapt_data)
- if use_dataset:
- # Keras APIs expect batched datasets
- adapt_data = dataset_ops.Dataset.from_tensor_slices(adapt_data).batch(
- test_data.shape[0] // 2)
- test_data = dataset_ops.Dataset.from_tensor_slices(test_data).batch(
- test_data.shape[0] // 2)
-
- layer = discretization.Discretization(epsilon=epsilon, num_bins=num_bins)
- layer.adapt(adapt_data)
-
- input_data = keras.Input(shape=input_shape)
- output = layer(input_data)
- model = keras.Model(input_data, output)
- model._run_eagerly = testing_utils.should_run_eagerly()
- output_data = model.predict(test_data)
- self.assertAllClose(expected, output_data)
-
- def test_merge_state(self):
- data = np.arange(300)
- partial_ds_1 = dataset_ops.Dataset.from_tensor_slices(data[:100])
- partial_ds_2 = dataset_ops.Dataset.from_tensor_slices(data[100:200])
- partial_ds_3 = dataset_ops.Dataset.from_tensor_slices(data[200:])
- full_ds = partial_ds_1.concatenate(partial_ds_2).concatenate(partial_ds_3)
-
- # Use a small epsilon to avoid any discrepancies from the quantile
- # approximation.
- full_layer = discretization.Discretization(num_bins=3, epsilon=0.001)
- full_layer.adapt(full_ds.batch(2))
-
- partial_layer_1 = discretization.Discretization(num_bins=3, epsilon=0.001)
- partial_layer_1.adapt(partial_ds_1.batch(2))
- partial_layer_2 = discretization.Discretization(num_bins=3, epsilon=0.001)
- partial_layer_2.adapt(partial_ds_2.batch(2))
- partial_layer_3 = discretization.Discretization(num_bins=3, epsilon=0.001)
- partial_layer_3.adapt(partial_ds_3.batch(2))
- partial_layer_1.merge_state([partial_layer_2, partial_layer_3])
- merged_layer = partial_layer_1
-
- data = np.arange(300)
- self.assertAllClose(full_layer(data), merged_layer(data))
-
- def test_merge_with_stateless_layers_fails(self):
- layer1 = discretization.Discretization(num_bins=2, name="layer1")
- layer1.adapt([1, 2, 3])
- layer2 = discretization.Discretization(bin_boundaries=[0, 1], name="layer2")
- with self.assertRaisesRegex(ValueError, "Cannot merge.*layer2"):
- layer1.merge_state([layer2])
-
- def test_merge_with_unadapted_layers_fails(self):
- layer1 = discretization.Discretization(num_bins=2, name="layer1")
- layer1.adapt([1, 2, 3])
- layer2 = discretization.Discretization(num_bins=2, name="layer2")
- with self.assertRaisesRegex(ValueError, "Cannot merge.*layer2"):
- layer1.merge_state([layer2])
-
- def test_multiple_adapts(self):
- first_adapt = [[1], [2], [3]]
- second_adapt = [[4], [5], [6]]
- predict_input = [[2], [2]]
- expected_first_output = [[2], [2]]
- expected_second_output = [[0], [0]]
-
- inputs = keras.Input(shape=(1,), dtype=dtypes.int32)
- layer = discretization.Discretization(num_bins=3)
- layer.adapt(first_adapt)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
-
- actual_output = model.predict(predict_input)
- self.assertAllClose(actual_output, expected_first_output)
-
- # Re-adapt the layer on new inputs.
- layer.adapt(second_adapt)
- # Re-compile the model.
- model.compile()
- # `predict` should now use the new model state.
- actual_output = model.predict(predict_input)
- self.assertAllClose(actual_output, expected_second_output)
-
- def test_saved_model_tf(self):
- input_data = [[1], [2], [3]]
- predict_data = [[0.5], [1.5], [2.5]]
- expected_output = [[0], [1], [2]]
-
- inputs = keras.Input(shape=(1,), dtype=dtypes.float32)
- layer = discretization.Discretization(num_bins=3)
- layer.adapt(input_data)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
-
- output_data = model.predict(predict_data)
- self.assertAllClose(output_data, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_saved_model")
- save.save(model, output_path)
- loaded_model = load.load(output_path)
- f = loaded_model.signatures["serving_default"]
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_data = f(constant_op.constant(predict_data))["discretization"]
- self.assertAllClose(new_output_data, expected_output)
-
- def test_saved_model_keras(self):
- input_data = [[1], [2], [3]]
- predict_data = [[0.5], [1.5], [2.5]]
- expected_output = [[0], [1], [2]]
-
- cls = discretization.Discretization
- inputs = keras.Input(shape=(1,), dtype=dtypes.float32)
- layer = cls(num_bins=3)
- layer.adapt(input_data)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
-
- output_data = model.predict(predict_data)
- self.assertAllClose(output_data, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- model.save(output_path, save_format="tf")
- loaded_model = keras.models.load_model(
- output_path, custom_objects={"Discretization": cls})
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_data = loaded_model.predict(predict_data)
- self.assertAllClose(new_output_data, expected_output)
-
- def test_saved_weights_keras(self):
- input_data = [[1], [2], [3]]
- predict_data = [[0.5], [1.5], [2.5]]
- expected_output = [[0], [1], [2]]
-
- cls = discretization.Discretization
- inputs = keras.Input(shape=(1,), dtype=dtypes.float32)
- layer = cls(num_bins=3)
- layer.adapt(input_data)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
-
- output_data = model.predict(predict_data)
- self.assertAllClose(output_data, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_weights")
- model.save_weights(output_path, save_format="tf")
- new_model = keras.Model.from_config(
- model.get_config(), custom_objects={"Discretization": cls})
- new_model.load_weights(output_path)
-
- # Validate correctness of the new model.
- new_output_data = new_model.predict(predict_data)
- self.assertAllClose(new_output_data, expected_output)
-
-
-if __name__ == "__main__":
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/hashing.py b/tensorflow/python/keras/layers/preprocessing/hashing.py
deleted file mode 100644
index 1a07b6d..0000000
--- a/tensorflow/python/keras/layers/preprocessing/hashing.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras hashing preprocessing layer."""
-# pylint: disable=g-classes-have-attributes
-
-import functools
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras.engine import base_layer
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.util.tf_export import keras_export
-
-# Default key from tf.sparse.cross_hashed
-_DEFAULT_SALT_KEY = [0xDECAFCAFFE, 0xDECAFCAFFE]
-
-
-@keras_export('keras.layers.experimental.preprocessing.Hashing')
-class Hashing(base_layer.Layer):
- """Implements categorical feature hashing, also known as "hashing trick".
-
- This layer transforms single or multiple categorical inputs to hashed output.
- It converts a sequence of int or string to a sequence of int. The stable hash
- function uses `tensorflow::ops::Fingerprint` to produce the same output
- consistently across all platforms.
-
- This layer uses [FarmHash64](https://github.com/google/farmhash) by default,
- which provides a consistent hashed output across different platforms and is
- stable across invocations, regardless of device and context, by mixing the
- input bits thoroughly.
-
- If you want to obfuscate the hashed output, you can also pass a random `salt`
- argument in the constructor. In that case, the layer will use the
- [SipHash64](https://github.com/google/highwayhash) hash function, with
- the `salt` value serving as additional input to the hash function.
-
- Example (FarmHash64):
-
- >>> layer = tf.keras.layers.experimental.preprocessing.Hashing(num_bins=3)
- >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']]
- >>> layer(inp)
- <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
- array([[1],
- [0],
- [1],
- [1],
- [2]])>
-
- Example (FarmHash64) with a mask value:
-
- >>> layer = tf.keras.layers.experimental.preprocessing.Hashing(num_bins=3,
- ... mask_value='')
- >>> inp = [['A'], ['B'], [''], ['C'], ['D']]
- >>> layer(inp)
- <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
- array([[1],
- [1],
- [0],
- [2],
- [2]])>
-
- Example (SipHash64):
-
- >>> layer = tf.keras.layers.experimental.preprocessing.Hashing(num_bins=3,
- ... salt=[133, 137])
- >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']]
- >>> layer(inp)
- <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
- array([[1],
- [2],
- [1],
- [0],
- [2]])>
-
- Example (SipHash64 with a single integer, same as `salt=[133, 133]`):
-
- >>> layer = tf.keras.layers.experimental.preprocessing.Hashing(num_bins=3,
- ... salt=133)
- >>> inp = [['A'], ['B'], ['C'], ['D'], ['E']]
- >>> layer(inp)
- <tf.Tensor: shape=(5, 1), dtype=int64, numpy=
- array([[0],
- [0],
- [2],
- [1],
- [0]])>
-
- Reference: [SipHash with salt](https://www.131002.net/siphash/siphash.pdf)
-
- Args:
- num_bins: Number of hash bins. Note that this includes the `mask_value` bin,
- so the effective number of bins is `(num_bins - 1)` if `mask_value` is
- set.
- mask_value: A value that represents masked inputs, which are mapped to
- index 0. Defaults to None, meaning no mask term will be added and the
- hashing will start at index 0.
- salt: A single unsigned integer, a tuple/list of 2 unsigned integers, or
- None. If passed, the hash function used will be SipHash64, with these
- values used as an additional input (known as a "salt" in cryptography).
- These should be non-zero. Defaults to `None` (in that case, the FarmHash64
- hash function is used). See the reference paper for details on the
- two-integer form.
- **kwargs: Keyword arguments to construct a layer.
-
- Input shape: A single or list of string, int32 or int64 `Tensor`,
- `SparseTensor` or `RaggedTensor` of shape `[batch_size, ...]`
-
- Output shape: An int64 `Tensor`, `SparseTensor` or `RaggedTensor` of shape
- `[batch_size, ...]`. If any input is `RaggedTensor` then output is
- `RaggedTensor`, otherwise if any input is `SparseTensor` then output is
- `SparseTensor`, otherwise the output is `Tensor`.
-
- """
-
- def __init__(self, num_bins, mask_value=None, salt=None, **kwargs):
- if num_bins is None or num_bins <= 0:
- raise ValueError('`num_bins` cannot be `None` or a non-positive value.')
- super(Hashing, self).__init__(**kwargs)
- self.num_bins = num_bins
- self.mask_value = mask_value
- self.strong_hash = salt is not None
- if salt is not None:
- if isinstance(salt, (tuple, list)) and len(salt) == 2:
- self.salt = salt
- elif isinstance(salt, int):
- self.salt = [salt, salt]
- else:
- raise ValueError('`salt` can only be a tuple of size 2 integers, or a '
- 'single integer, given {}'.format(salt))
- else:
- self.salt = _DEFAULT_SALT_KEY
-
- def _preprocess_single_input(self, inp):
- if isinstance(inp, (list, tuple, np.ndarray)):
- inp = ops.convert_to_tensor_v2_with_dispatch(inp)
- return inp
-
- def _preprocess_inputs(self, inputs):
- if isinstance(inputs, (tuple, list)):
- # If any of them is tensor or ndarray, then treat as list
- if any(
- tensor_util.is_tf_type(inp) or isinstance(inp, np.ndarray)
- for inp in inputs):
- return [self._preprocess_single_input(inp) for inp in inputs]
- return self._preprocess_single_input(inputs)
-
- def call(self, inputs):
- inputs = self._preprocess_inputs(inputs)
- if isinstance(inputs, sparse_tensor.SparseTensor):
- return sparse_tensor.SparseTensor(
- indices=inputs.indices,
- values=self._hash_values_to_bins(inputs.values),
- dense_shape=inputs.dense_shape)
- return self._hash_values_to_bins(inputs)
-
- def _hash_values_to_bins(self, values):
- """Converts a non-sparse tensor of values to bin indices."""
- str_to_hash_bucket = self._get_string_to_hash_bucket_fn()
- num_available_bins = self.num_bins
- mask = None
- # If mask_value is set, the zeroth bin is reserved for it.
- if self.mask_value is not None and num_available_bins > 1:
- num_available_bins -= 1
- mask = math_ops.equal(values, self.mask_value)
- # Convert all values to strings before hashing.
- if values.dtype.is_integer:
- values = string_ops.as_string(values)
- values = str_to_hash_bucket(values, num_available_bins, name='hash')
- if mask is not None:
- values = math_ops.add(values, array_ops.ones_like(values))
- values = array_ops.where(mask, array_ops.zeros_like(values), values)
- return values
-
- def _get_string_to_hash_bucket_fn(self):
- """Returns the string_to_hash_bucket op to use based on `hasher_key`."""
- # string_to_hash_bucket_fast uses FarmHash64 as hash function.
- if not self.strong_hash:
- return string_ops.string_to_hash_bucket_fast
- # string_to_hash_bucket_strong uses SipHash64 as hash function.
- else:
- return functools.partial(
- string_ops.string_to_hash_bucket_strong, key=self.salt)
-
- def compute_output_shape(self, input_shape):
- return input_shape
-
- def compute_output_signature(self, input_spec):
- output_shape = self.compute_output_shape(input_spec.shape)
- output_dtype = dtypes.int64
- if isinstance(input_spec, sparse_tensor.SparseTensorSpec):
- return sparse_tensor.SparseTensorSpec(
- shape=output_shape, dtype=output_dtype)
- else:
- return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype)
-
- def get_config(self):
- config = {
- 'num_bins': self.num_bins,
- 'salt': self.salt,
- 'mask_value': self.mask_value,
- }
- base_config = super(Hashing, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
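For reference, the bin-reservation behavior implemented by `_hash_values_to_bins` above (bin 0 reserved for `mask_value`, everything else hashed into `num_bins - 1` bins) can be reproduced with plain TF string ops. A minimal sketch with hypothetical inputs, assuming TF 2.x eager execution:

import tensorflow as tf

values = tf.constant([['omar'], ['stringer'], [''], ['marlo']])
num_bins, mask_value = 3, ''
mask = tf.equal(values, mask_value)
# Hash into num_bins - 1 buckets, then shift by one so bin 0 stays free.
hashed = tf.strings.to_hash_bucket_fast(values, num_bins - 1) + 1
# Masked entries are routed to the reserved bin 0.
print(tf.where(mask, tf.zeros_like(hashed), hashed))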
diff --git a/tensorflow/python/keras/layers/preprocessing/hashing_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/hashing_distribution_test.py
deleted file mode 100644
index d619b14..0000000
--- a/tensorflow/python/keras/layers/preprocessing/hashing_distribution_test.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for keras.layers.preprocessing.hashing."""
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import combinations as ds_combinations
-from tensorflow.python.framework import config
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_combinations as combinations
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.distribute.strategy_combinations import all_strategies
-from tensorflow.python.keras.layers.preprocessing import hashing
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.platform import test
-
-
-@ds_combinations.generate(
- combinations.combine(
- distribution=all_strategies,
- mode=["eager", "graph"]))
-class HashingDistributionTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_distribution(self, distribution):
- input_data = np.asarray([["omar"], ["stringer"], ["marlo"], ["wire"]])
- input_dataset = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
- 2, drop_remainder=True)
- expected_output = [[0], [0], [1], [0]]
-
- config.set_soft_device_placement(True)
-
- with distribution.scope():
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = hashing.Hashing(num_bins=2)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_dataset)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-if __name__ == "__main__":
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/hashing_test.py b/tensorflow/python/keras/layers/preprocessing/hashing_test.py
deleted file mode 100644
index 351160b..0000000
--- a/tensorflow/python/keras/layers/preprocessing/hashing_test.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for hashing layer."""
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.engine import input_layer
-from tensorflow.python.keras.engine import training
-from tensorflow.python.keras.layers.preprocessing import hashing
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.platform import test
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class HashingTest(keras_parameterized.TestCase):
-
- def test_hash_single_bin(self):
- layer = hashing.Hashing(num_bins=1)
- inp = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']])
- output = layer(inp)
- self.assertAllClose([[0], [0], [0], [0], [0]], output)
-
- def test_hash_dense_input_farmhash(self):
- layer = hashing.Hashing(num_bins=2)
- inp = np.asarray([['omar'], ['stringer'], ['marlo'], ['wire'],
- ['skywalker']])
- output = layer(inp)
- # Assert equal for hashed output that should be true on all platforms.
- self.assertAllClose([[0], [0], [1], [0], [0]], output)
-
- def test_hash_dense_input_mask_value_farmhash(self):
- empty_mask_layer = hashing.Hashing(num_bins=3, mask_value='')
- omar_mask_layer = hashing.Hashing(num_bins=3, mask_value='omar')
- inp = np.asarray([['omar'], ['stringer'], ['marlo'], ['wire'],
- ['skywalker']])
- empty_mask_output = empty_mask_layer(inp)
- omar_mask_output = omar_mask_layer(inp)
- # Outputs should be one more than test_hash_dense_input_farmhash (the zeroth
- # bin is now reserved for masks).
- self.assertAllClose([[1], [1], [2], [1], [1]], empty_mask_output)
- # 'omar' should map to 0.
- self.assertAllClose([[0], [1], [2], [1], [1]], omar_mask_output)
-
- def test_hash_dense_list_input_farmhash(self):
- layer = hashing.Hashing(num_bins=2)
- inp = [['omar'], ['stringer'], ['marlo'], ['wire'], ['skywalker']]
- output = layer(inp)
- # Assert equal for hashed output that should be true on all platforms.
- self.assertAllClose([[0], [0], [1], [0], [0]], output)
-
- inp = ['omar', 'stringer', 'marlo', 'wire', 'skywalker']
- output = layer(inp)
- # Assert equal for hashed output that should be true on all platforms.
- self.assertAllClose([0, 0, 1, 0, 0], output)
-
- def test_hash_dense_int_input_farmhash(self):
- layer = hashing.Hashing(num_bins=3)
- inp = np.asarray([[0], [1], [2], [3], [4]])
- output = layer(inp)
- # Assert equal for hashed output that should be true on all platforms.
- self.assertAllClose([[1], [0], [1], [0], [2]], output)
-
- def test_hash_dense_input_siphash(self):
- layer = hashing.Hashing(num_bins=2, salt=[133, 137])
- inp = np.asarray([['omar'], ['stringer'], ['marlo'], ['wire'],
- ['skywalker']])
- output = layer(inp)
- # Assert equal for hashed output that should be true on all platforms.
- # Note the result is different from FarmHash.
- self.assertAllClose([[0], [1], [0], [1], [0]], output)
-
- layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137])
- output_2 = layer_2(inp)
- # Note the result is different from (133, 137).
- self.assertAllClose([[1], [0], [1], [0], [1]], output_2)
-
- def test_hash_dense_int_input_siphash(self):
- layer = hashing.Hashing(num_bins=3, salt=[133, 137])
- inp = np.asarray([[0], [1], [2], [3], [4]])
- output = layer(inp)
- # Assert equal for hashed output that should be true on all platforms.
- self.assertAllClose([[1], [1], [2], [0], [1]], output)
-
- def test_hash_sparse_input_farmhash(self):
- layer = hashing.Hashing(num_bins=2)
- indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
- inp = sparse_tensor.SparseTensor(
- indices=indices,
- values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'],
- dense_shape=[3, 2])
- output = layer(inp)
- self.assertAllClose(indices, output.indices)
- self.assertAllClose([0, 0, 1, 0, 0], output.values)
-
- def test_hash_sparse_input_mask_value_farmhash(self):
- empty_mask_layer = hashing.Hashing(num_bins=3, mask_value='')
- omar_mask_layer = hashing.Hashing(num_bins=3, mask_value='omar')
- indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
- inp = sparse_tensor.SparseTensor(
- indices=indices,
- values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'],
- dense_shape=[3, 2])
- empty_mask_output = empty_mask_layer(inp)
- omar_mask_output = omar_mask_layer(inp)
- self.assertAllClose(indices, omar_mask_output.indices)
- self.assertAllClose(indices, empty_mask_output.indices)
- # Outputs should be one more than test_hash_sparse_input_farmhash (the
- # zeroth bin is now reserved for masks).
- self.assertAllClose([1, 1, 2, 1, 1], empty_mask_output.values)
- # 'omar' should map to 0.
- self.assertAllClose([0, 1, 2, 1, 1], omar_mask_output.values)
-
- def test_hash_sparse_int_input_farmhash(self):
- layer = hashing.Hashing(num_bins=3)
- indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
- inp = sparse_tensor.SparseTensor(
- indices=indices, values=[0, 1, 2, 3, 4], dense_shape=[3, 2])
- output = layer(inp)
- self.assertAllClose(indices, output.indices)
- self.assertAllClose([1, 0, 1, 0, 2], output.values)
-
- def test_hash_sparse_input_siphash(self):
- layer = hashing.Hashing(num_bins=2, salt=[133, 137])
- indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
- inp = sparse_tensor.SparseTensor(
- indices=indices,
- values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'],
- dense_shape=[3, 2])
- output = layer(inp)
- self.assertAllClose(output.indices, indices)
- # The result should be same with test_hash_dense_input_siphash.
- self.assertAllClose([0, 1, 0, 1, 0], output.values)
-
- layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137])
- output = layer_2(inp)
- # The result should be same with test_hash_dense_input_siphash.
- self.assertAllClose([1, 0, 1, 0, 1], output.values)
-
- def test_hash_sparse_int_input_siphash(self):
- layer = hashing.Hashing(num_bins=3, salt=[133, 137])
- indices = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
- inp = sparse_tensor.SparseTensor(
- indices=indices, values=[0, 1, 2, 3, 4], dense_shape=[3, 2])
- output = layer(inp)
- self.assertAllClose(indices, output.indices)
- self.assertAllClose([1, 1, 2, 0, 1], output.values)
-
- def test_hash_ragged_string_input_farmhash(self):
- layer = hashing.Hashing(num_bins=2)
- inp_data = ragged_factory_ops.constant(
- [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
- dtype=dtypes.string)
- out_data = layer(inp_data)
- # Same hashed output as test_hash_sparse_input_farmhash
- expected_output = [[0, 0, 1, 0], [1, 0, 0]]
- self.assertAllEqual(expected_output, out_data)
-
- inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- out_t = layer(inp_t)
- model = training.Model(inputs=inp_t, outputs=out_t)
- self.assertAllClose(out_data, model.predict(inp_data))
-
- def test_hash_ragged_input_mask_value(self):
- empty_mask_layer = hashing.Hashing(num_bins=3, mask_value='')
- omar_mask_layer = hashing.Hashing(num_bins=3, mask_value='omar')
- inp_data = ragged_factory_ops.constant(
- [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
- dtype=dtypes.string)
- empty_mask_output = empty_mask_layer(inp_data)
- omar_mask_output = omar_mask_layer(inp_data)
- # Outputs should be one more than test_hash_ragged_string_input_farmhash
- # (the zeroth bin is now reserved for masks).
- expected_output = [[1, 1, 2, 1], [2, 1, 1]]
- self.assertAllClose(expected_output, empty_mask_output)
- # 'omar' should map to 0.
- expected_output = [[0, 1, 2, 1], [2, 1, 1]]
- self.assertAllClose(expected_output, omar_mask_output)
-
- def test_hash_ragged_int_input_farmhash(self):
- layer = hashing.Hashing(num_bins=3)
- inp_data = ragged_factory_ops.constant([[0, 1, 3, 4], [2, 1, 0]],
- dtype=dtypes.int64)
- out_data = layer(inp_data)
- # Same hashed output as test_hash_sparse_int_input_farmhash
- expected_output = [[1, 0, 0, 2], [1, 0, 1]]
- self.assertAllEqual(expected_output, out_data)
-
- inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.int64)
- out_t = layer(inp_t)
- model = training.Model(inputs=inp_t, outputs=out_t)
- self.assertAllClose(out_data, model.predict(inp_data))
-
- def test_hash_ragged_string_input_siphash(self):
- layer = hashing.Hashing(num_bins=2, salt=[133, 137])
- inp_data = ragged_factory_ops.constant(
- [['omar', 'stringer', 'marlo', 'wire'], ['marlo', 'skywalker', 'wire']],
- dtype=dtypes.string)
- out_data = layer(inp_data)
- # Same hashed output as test_hash_dense_input_siphash
- expected_output = [[0, 1, 0, 1], [0, 0, 1]]
- self.assertAllEqual(expected_output, out_data)
-
- inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- out_t = layer(inp_t)
- model = training.Model(inputs=inp_t, outputs=out_t)
- self.assertAllClose(out_data, model.predict(inp_data))
-
- layer_2 = hashing.Hashing(num_bins=2, salt=[211, 137])
- out_data = layer_2(inp_data)
- expected_output = [[1, 0, 1, 0], [1, 1, 0]]
- self.assertAllEqual(expected_output, out_data)
-
- out_t = layer_2(inp_t)
- model = training.Model(inputs=inp_t, outputs=out_t)
- self.assertAllClose(out_data, model.predict(inp_data))
-
- def test_hash_ragged_int_input_siphash(self):
- layer = hashing.Hashing(num_bins=3, salt=[133, 137])
- inp_data = ragged_factory_ops.constant([[0, 1, 3, 4], [2, 1, 0]],
- dtype=dtypes.int64)
- out_data = layer(inp_data)
- # Same hashed output as test_hash_sparse_int_input_siphash
- expected_output = [[1, 1, 0, 1], [2, 1, 1]]
- self.assertAllEqual(expected_output, out_data)
-
- inp_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.int64)
- out_t = layer(inp_t)
- model = training.Model(inputs=inp_t, outputs=out_t)
- self.assertAllClose(out_data, model.predict(inp_data))
-
- def test_invalid_inputs(self):
- with self.assertRaisesRegex(ValueError, 'cannot be `None`'):
- _ = hashing.Hashing(num_bins=None)
- with self.assertRaisesRegex(ValueError, 'cannot be `None`'):
- _ = hashing.Hashing(num_bins=-1)
- with self.assertRaisesRegex(ValueError, 'can only be a tuple of size 2'):
- _ = hashing.Hashing(num_bins=2, salt='string')
- with self.assertRaisesRegex(ValueError, 'can only be a tuple of size 2'):
- _ = hashing.Hashing(num_bins=2, salt=[1])
- with self.assertRaisesRegex(ValueError, 'can only be a tuple of size 2'):
- _ = hashing.Hashing(num_bins=1, salt=constant_op.constant([133, 137]))
-
- def test_hash_compute_output_signature(self):
- input_shape = tensor_shape.TensorShape([2, 3])
- input_spec = tensor_spec.TensorSpec(input_shape, dtypes.string)
- layer = hashing.Hashing(num_bins=2)
- output_spec = layer.compute_output_signature(input_spec)
- self.assertEqual(output_spec.shape.dims, input_shape.dims)
- self.assertEqual(output_spec.dtype, dtypes.int64)
-
- @testing_utils.run_v2_only
- def test_config_with_custom_name(self):
- layer = hashing.Hashing(num_bins=2, name='hashing')
- config = layer.get_config()
- layer_1 = hashing.Hashing.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-if __name__ == '__main__':
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py
deleted file mode 100644
index 32e8f39..0000000
--- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py
+++ /dev/null
@@ -1,1318 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras image preprocessing layers."""
-# pylint: disable=g-classes-have-attributes
-
-import numpy as np
-
-from tensorflow.python.eager import context
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras import backend
-from tensorflow.python.keras.engine import base_layer
-from tensorflow.python.keras.engine.input_spec import InputSpec
-from tensorflow.python.keras.preprocessing import image as image_preprocessing
-from tensorflow.python.keras.utils import control_flow_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_image_ops
-from tensorflow.python.ops import image_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import stateful_random_ops
-from tensorflow.python.ops import stateless_random_ops
-from tensorflow.python.util.tf_export import keras_export
-
-ResizeMethod = image_ops.ResizeMethod
-
-_RESIZE_METHODS = {
- 'bilinear': ResizeMethod.BILINEAR,
- 'nearest': ResizeMethod.NEAREST_NEIGHBOR,
- 'bicubic': ResizeMethod.BICUBIC,
- 'area': ResizeMethod.AREA,
- 'lanczos3': ResizeMethod.LANCZOS3,
- 'lanczos5': ResizeMethod.LANCZOS5,
- 'gaussian': ResizeMethod.GAUSSIAN,
- 'mitchellcubic': ResizeMethod.MITCHELLCUBIC
-}
-
-H_AXIS = 1
-W_AXIS = 2
-
-
-def check_fill_mode_and_interpolation(fill_mode, interpolation):
- if fill_mode not in {'reflect', 'wrap', 'constant', 'nearest'}:
- raise NotImplementedError(
- 'Unknown `fill_mode` {}. Only `reflect`, `wrap`, '
- '`constant` and `nearest` are supported.'.format(fill_mode))
- if interpolation not in {'nearest', 'bilinear'}:
- raise NotImplementedError('Unknown `interpolation` {}. Only `nearest` and '
- '`bilinear` are supported.'.format(interpolation))
-
-
-@keras_export('keras.layers.experimental.preprocessing.Resizing')
-class Resizing(base_layer.Layer):
- """Image resizing layer.
-
- Resize the batched image input to target height and width. The input should
- be a 4-D tensor in the format of NHWC.
-
- Args:
- height: Integer, the height of the output shape.
- width: Integer, the width of the output shape.
- interpolation: String, the interpolation method. Defaults to `bilinear`.
- Supports `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, `lanczos5`,
- `gaussian`, `mitchellcubic`.
- crop_to_aspect_ratio: If True, resize the images without aspect
- ratio distortion. When the original aspect ratio differs from the target
- aspect ratio, the output image will be cropped so as to return the largest
- possible window in the image (of size `(height, width)`) that matches
- the target aspect ratio. By default (`crop_to_aspect_ratio=False`),
- aspect ratio may not be preserved.
- """
-
- def __init__(self,
- height,
- width,
- interpolation='bilinear',
- crop_to_aspect_ratio=False,
- **kwargs):
- self.target_height = height
- self.target_width = width
- self.interpolation = interpolation
- self.crop_to_aspect_ratio = crop_to_aspect_ratio
- self._interpolation_method = get_interpolation(interpolation)
- self.input_spec = InputSpec(ndim=4)
- super(Resizing, self).__init__(**kwargs)
-
- def call(self, inputs):
- if self.crop_to_aspect_ratio:
- outputs = image_preprocessing.smart_resize(
- inputs,
- size=[self.target_height, self.target_width],
- interpolation=self._interpolation_method)
- else:
- outputs = image_ops.resize_images_v2(
- inputs,
- size=[self.target_height, self.target_width],
- method=self._interpolation_method)
- return outputs
-
- def compute_output_shape(self, input_shape):
- input_shape = tensor_shape.TensorShape(input_shape).as_list()
- return tensor_shape.TensorShape(
- [input_shape[0], self.target_height, self.target_width, input_shape[3]])
-
- def get_config(self):
- config = {
- 'height': self.target_height,
- 'width': self.target_width,
- 'interpolation': self.interpolation,
- 'crop_to_aspect_ratio': self.crop_to_aspect_ratio,
- }
- base_config = super(Resizing, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
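A minimal usage sketch of the Resizing layer removed here, via the `tf.keras.layers.experimental.preprocessing` export declared above; the random input is purely illustrative:

import numpy as np
import tensorflow as tf

images = np.random.random((8, 100, 150, 3)).astype('float32')  # NHWC batch
resize = tf.keras.layers.experimental.preprocessing.Resizing(
    height=64, width=64, interpolation='bilinear')
print(resize(images).shape)  # (8, 64, 64, 3)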
-
-
-@keras_export('keras.layers.experimental.preprocessing.CenterCrop')
-class CenterCrop(base_layer.Layer):
- """Crop the central portion of the images to target height and width.
-
- Input shape:
- 4D tensor with shape:
- `(samples, height, width, channels)`, data_format='channels_last'.
-
- Output shape:
- 4D tensor with shape:
- `(samples, target_height, target_width, channels)`.
-
- If the input height/width is even and the target height/width is odd (or
- inversely), the input image is left-padded by 1 pixel.
-
- Args:
- height: Integer, the height of the output shape.
- width: Integer, the width of the output shape.
- """
-
- def __init__(self, height, width, **kwargs):
- self.target_height = height
- self.target_width = width
- self.input_spec = InputSpec(ndim=4)
- super(CenterCrop, self).__init__(**kwargs)
-
- def call(self, inputs):
- inputs_shape = array_ops.shape(inputs)
- img_hd = inputs_shape[H_AXIS]
- img_wd = inputs_shape[W_AXIS]
- img_hd_diff = img_hd - self.target_height
- img_wd_diff = img_wd - self.target_width
- checks = []
- checks.append(
- check_ops.assert_non_negative(
- img_hd_diff,
- message='The crop height {} should not be greater than input '
- 'height.'.format(self.target_height)))
- checks.append(
- check_ops.assert_non_negative(
- img_wd_diff,
- message='The crop width {} should not be greater than input '
- 'width.'.format(self.target_width)))
- with ops.control_dependencies(checks):
- bbox_h_start = math_ops.cast(img_hd_diff / 2, dtypes.int32)
- bbox_w_start = math_ops.cast(img_wd_diff / 2, dtypes.int32)
- bbox_begin = array_ops.stack([0, bbox_h_start, bbox_w_start, 0])
- bbox_size = array_ops.stack(
- [-1, self.target_height, self.target_width, -1])
- outputs = array_ops.slice(inputs, bbox_begin, bbox_size)
- return outputs
-
- def compute_output_shape(self, input_shape):
- input_shape = tensor_shape.TensorShape(input_shape).as_list()
- return tensor_shape.TensorShape(
- [input_shape[0], self.target_height, self.target_width, input_shape[3]])
-
- def get_config(self):
- config = {
- 'height': self.target_height,
- 'width': self.target_width,
- }
- base_config = super(CenterCrop, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
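A small sketch of the CenterCrop behavior, using the export path declared above; the 4x4 input values are hypothetical:

import tensorflow as tf

crop = tf.keras.layers.experimental.preprocessing.CenterCrop(height=2, width=2)
images = tf.reshape(tf.range(16, dtype=tf.float32), (1, 4, 4, 1))
# The central 2x2 window of the 4x4 input: [[5., 6.], [9., 10.]]
print(crop(images)[0, ..., 0])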
-
-
-@keras_export('keras.layers.experimental.preprocessing.RandomCrop')
-class RandomCrop(base_layer.Layer):
- """Randomly crop the images to target height and width.
-
- This layer will crop all the images in the same batch to the same cropping
- location.
- By default, random cropping is only applied during training. At inference
- time, the images will be first rescaled to preserve the shorter side, and
- center cropped. If you need to apply random cropping at inference time,
- set `training` to True when calling the layer.
-
- Input shape:
- 4D tensor with shape:
- `(samples, height, width, channels)`, data_format='channels_last'.
-
- Output shape:
- 4D tensor with shape:
- `(samples, target_height, target_width, channels)`.
-
- Args:
- height: Integer, the height of the output shape.
- width: Integer, the width of the output shape.
- seed: Integer. Used to create a random seed.
- """
-
- def __init__(self, height, width, seed=None, **kwargs):
- self.height = height
- self.width = width
- self.seed = seed
- self._rng = make_generator(self.seed)
- self.input_spec = InputSpec(ndim=4)
- super(RandomCrop, self).__init__(**kwargs)
-
- def call(self, inputs, training=True):
- if training is None:
- training = backend.learning_phase()
-
- def random_cropped_inputs():
- """Cropped inputs with stateless random ops."""
- input_shape = array_ops.shape(inputs)
- crop_size = array_ops.stack(
- [input_shape[0], self.height, self.width, input_shape[3]])
- check = control_flow_ops.Assert(
- math_ops.reduce_all(input_shape >= crop_size),
- [self.height, self.width])
- with ops.control_dependencies([check]):
- limit = input_shape - crop_size + 1
- offset = stateless_random_ops.stateless_random_uniform(
- array_ops.shape(input_shape),
- dtype=crop_size.dtype,
- maxval=crop_size.dtype.max,
- seed=self._rng.make_seeds()[:, 0]) % limit
- return array_ops.slice(inputs, offset, crop_size)
-
- # TODO(b/143885775): Share logic with Resize and CenterCrop.
- def resize_and_center_cropped_inputs():
- """Deterministically resize to shorter side and center crop."""
- input_shape = array_ops.shape(inputs)
- input_height_t = input_shape[H_AXIS]
- input_width_t = input_shape[W_AXIS]
- ratio_cond = (input_height_t / input_width_t > (self.height / self.width))
- # pylint: disable=g-long-lambda
- resized_height = control_flow_util.smart_cond(
- ratio_cond,
- lambda: math_ops.cast(self.width * input_height_t / input_width_t,
- input_height_t.dtype), lambda: self.height)
- resized_width = control_flow_util.smart_cond(
- ratio_cond, lambda: self.width,
- lambda: math_ops.cast(self.height * input_width_t / input_height_t,
- input_width_t.dtype))
- # pylint: enable=g-long-lambda
- resized_inputs = image_ops.resize_images_v2(
- images=inputs, size=array_ops.stack([resized_height, resized_width]))
-
- img_hd_diff = resized_height - self.height
- img_wd_diff = resized_width - self.width
- bbox_h_start = math_ops.cast(img_hd_diff / 2, dtypes.int32)
- bbox_w_start = math_ops.cast(img_wd_diff / 2, dtypes.int32)
- bbox_begin = array_ops.stack([0, bbox_h_start, bbox_w_start, 0])
- bbox_size = array_ops.stack([-1, self.height, self.width, -1])
- outputs = array_ops.slice(resized_inputs, bbox_begin, bbox_size)
- return outputs
-
- output = control_flow_util.smart_cond(training, random_cropped_inputs,
- resize_and_center_cropped_inputs)
- original_shape = inputs.shape.as_list()
- batch_size, num_channels = original_shape[0], original_shape[3]
- output_shape = [batch_size] + [self.height, self.width] + [num_channels]
- output.set_shape(output_shape)
- return output
-
- def compute_output_shape(self, input_shape):
- input_shape = tensor_shape.TensorShape(input_shape).as_list()
- return tensor_shape.TensorShape(
- [input_shape[0], self.height, self.width, input_shape[3]])
-
- def get_config(self):
- config = {
- 'height': self.height,
- 'width': self.width,
- 'seed': self.seed,
- }
- base_config = super(RandomCrop, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
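A short sketch of the train/inference split described in the RandomCrop docstring (random window during training, resize-then-center-crop at inference); the shapes are hypothetical:

import numpy as np
import tensorflow as tf

crop = tf.keras.layers.experimental.preprocessing.RandomCrop(32, 32, seed=1)
images = np.random.random((4, 64, 48, 3)).astype('float32')
print(crop(images, training=True).shape)   # (4, 32, 32, 3): one random offset per batch
print(crop(images, training=False).shape)  # (4, 32, 32, 3): shorter side resized, then center-cropped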
-
-
-@keras_export('keras.layers.experimental.preprocessing.Rescaling')
-class Rescaling(base_layer.Layer):
- """Multiply inputs by `scale` and adds `offset`.
-
- For instance:
-
- 1. To rescale an input in the `[0, 255]` range
- to be in the `[0, 1]` range, you would pass `scale=1./255`.
-
- 2. To rescale an input in the `[0, 255]` range to be in the `[-1, 1]` range,
- you would pass `scale=1./127.5, offset=-1`.
-
- The rescaling is applied both during training and inference.
-
- Input shape:
- Arbitrary.
-
- Output shape:
- Same as input.
-
- Args:
- scale: Float, the scale to apply to the inputs.
- offset: Float, the offset to apply to the inputs.
- """
-
- def __init__(self, scale, offset=0., **kwargs):
- self.scale = scale
- self.offset = offset
- super(Rescaling, self).__init__(**kwargs)
-
- def call(self, inputs):
- dtype = self._compute_dtype
- scale = math_ops.cast(self.scale, dtype)
- offset = math_ops.cast(self.offset, dtype)
- return math_ops.cast(inputs, dtype) * scale + offset
-
- def compute_output_shape(self, input_shape):
- return input_shape
-
- def get_config(self):
- config = {
- 'scale': self.scale,
- 'offset': self.offset,
- }
- base_config = super(Rescaling, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
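A quick check of the second rescaling recipe from the docstring above (`scale=1./127.5, offset=-1` maps [0, 255] to [-1, 1]), using the export path declared above:

import tensorflow as tf

rescale = tf.keras.layers.experimental.preprocessing.Rescaling(
    scale=1. / 127.5, offset=-1.)
print(rescale(tf.constant([[0., 127.5, 255.]])))  # approximately [[-1., 0., 1.]]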
-
-
-HORIZONTAL = 'horizontal'
-VERTICAL = 'vertical'
-HORIZONTAL_AND_VERTICAL = 'horizontal_and_vertical'
-
-
-@keras_export('keras.layers.experimental.preprocessing.RandomFlip')
-class RandomFlip(base_layer.Layer):
- """Randomly flip each image horizontally and vertically.
-
- This layer will flip the images based on the `mode` attribute.
- During inference time, the output will be identical to input. Call the layer
- with `training=True` to flip the input.
-
- Input shape:
- 4D tensor with shape:
- `(samples, height, width, channels)`, data_format='channels_last'.
-
- Output shape:
- 4D tensor with shape:
- `(samples, height, width, channels)`, data_format='channels_last'.
-
- Attributes:
- mode: String indicating which flip mode to use. Can be "horizontal",
- "vertical", or "horizontal_and_vertical". Defaults to
- "horizontal_and_vertical". "horizontal" is a left-right flip and
- "vertical" is a top-bottom flip.
- seed: Integer. Used to create a random seed.
- """
-
- def __init__(self,
- mode=HORIZONTAL_AND_VERTICAL,
- seed=None,
- **kwargs):
- super(RandomFlip, self).__init__(**kwargs)
- self.mode = mode
- if mode == HORIZONTAL:
- self.horizontal = True
- self.vertical = False
- elif mode == VERTICAL:
- self.horizontal = False
- self.vertical = True
- elif mode == HORIZONTAL_AND_VERTICAL:
- self.horizontal = True
- self.vertical = True
- else:
- raise ValueError('RandomFlip layer {name} received an unknown mode '
- 'argument {arg}'.format(name=self.name, arg=mode))
- self.seed = seed
- self._rng = make_generator(self.seed)
- self.input_spec = InputSpec(ndim=4)
-
- def call(self, inputs, training=True):
- if training is None:
- training = backend.learning_phase()
-
- def random_flipped_inputs():
- flipped_outputs = inputs
- if self.horizontal:
- flipped_outputs = image_ops.stateless_random_flip_left_right(
- flipped_outputs,
- self._rng.make_seeds()[:, 0])
- if self.vertical:
- flipped_outputs = image_ops.stateless_random_flip_up_down(
- flipped_outputs,
- self._rng.make_seeds()[:, 0])
- return flipped_outputs
-
- output = control_flow_util.smart_cond(training, random_flipped_inputs,
- lambda: inputs)
- output.set_shape(inputs.shape)
- return output
-
- def compute_output_shape(self, input_shape):
- return input_shape
-
- def get_config(self):
- config = {
- 'mode': self.mode,
- 'seed': self.seed,
- }
- base_config = super(RandomFlip, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
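A minimal sketch of RandomFlip's inference behavior (the layer is a no-op unless called with `training=True`); the input array is hypothetical:

import numpy as np
import tensorflow as tf

flip = tf.keras.layers.experimental.preprocessing.RandomFlip('horizontal', seed=7)
images = np.arange(2 * 4 * 4 * 1, dtype='float32').reshape((2, 4, 4, 1))
print(np.allclose(flip(images, training=False), images))  # True: identity at inference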
-
-
-# TODO(tanzheny): Add examples, here and everywhere.
-@keras_export('keras.layers.experimental.preprocessing.RandomTranslation')
-class RandomTranslation(base_layer.Layer):
- """Randomly translate each image during training.
-
- Args:
- height_factor: a float represented as fraction of value, or a tuple of size
- 2 representing lower and upper bound for shifting vertically. A negative
- value means shifting image up, while a positive value means shifting image
- down. When represented as a single positive float, this value is used for
- both the upper and lower bound. For instance, `height_factor=(-0.2, 0.3)`
- results in an output shifted by a random amount in the range [-20%, +30%].
- `height_factor=0.2` results in an output shifted vertically by a random
- amount in the range [-20%, +20%].
- width_factor: a float represented as fraction of value, or a tuple of size 2
- representing lower and upper bound for shifting horizontally. A negative
- value means shifting image left, while a positive value means shifting
- image right. When represented as a single positive float, this value is
- used for both the upper and lower bound. For instance,
- `width_factor=(-0.2, 0.3)` results in an output shifted horizontally by a
- random amount in the range [-20%, +30%]. `width_factor=0.2` results in an
- output shifted left or right by a random amount in the range [-20%, +20%].
- fill_mode: Points outside the boundaries of the input are filled according
- to the given mode (one of `{'constant', 'reflect', 'wrap', 'nearest'}`).
- - *reflect*: `(d c b a | a b c d | d c b a)` The input is extended by
- reflecting about the edge of the last pixel.
- - *constant*: `(k k k k | a b c d | k k k k)` The input is extended by
- filling all values beyond the edge with the same constant value k = 0.
- - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by
- wrapping around to the opposite edge.
- - *nearest*: `(a a a a | a b c d | d d d d)` The input is extended by the
- nearest pixel.
- interpolation: Interpolation mode. Supported values: "nearest", "bilinear".
- seed: Integer. Used to create a random seed.
- fill_value: a float represents the value to be filled outside the boundaries
- when `fill_mode` is "constant".
- Input shape:
- 4D tensor with shape: `(samples, height, width, channels)`,
- data_format='channels_last'.
- Output shape:
- 4D tensor with shape: `(samples, height, width, channels)`,
- data_format='channels_last'.
- Raises:
- ValueError: if either bound is not in [-1, 1], or the upper bound is less
- than the lower bound.
- """
-
- def __init__(self,
- height_factor,
- width_factor,
- fill_mode='reflect',
- interpolation='bilinear',
- seed=None,
- fill_value=0.0,
- **kwargs):
- self.height_factor = height_factor
- if isinstance(height_factor, (tuple, list)):
- self.height_lower = height_factor[0]
- self.height_upper = height_factor[1]
- else:
- self.height_lower = -height_factor
- self.height_upper = height_factor
- if self.height_upper < self.height_lower:
- raise ValueError('`height_factor` cannot have upper bound less than '
- 'lower bound, got {}'.format(height_factor))
- if abs(self.height_lower) > 1. or abs(self.height_upper) > 1.:
- raise ValueError('`height_factor` must have values between [-1, 1], '
- 'got {}'.format(height_factor))
-
- self.width_factor = width_factor
- if isinstance(width_factor, (tuple, list)):
- self.width_lower = width_factor[0]
- self.width_upper = width_factor[1]
- else:
- self.width_lower = -width_factor
- self.width_upper = width_factor
- if self.width_upper < self.width_lower:
- raise ValueError('`width_factor` cannot have upper bound less than '
- 'lower bound, got {}'.format(width_factor))
- if abs(self.width_lower) > 1. or abs(self.width_upper) > 1.:
- raise ValueError('`width_factor` must have values between [-1, 1], '
- 'got {}'.format(width_factor))
-
- check_fill_mode_and_interpolation(fill_mode, interpolation)
-
- self.fill_mode = fill_mode
- self.fill_value = fill_value
- self.interpolation = interpolation
- self.seed = seed
- self._rng = make_generator(self.seed)
- self.input_spec = InputSpec(ndim=4)
- super(RandomTranslation, self).__init__(**kwargs)
-
- def call(self, inputs, training=True):
- if training is None:
- training = backend.learning_phase()
-
- def random_translated_inputs():
- """Translated inputs with random ops."""
- inputs_shape = array_ops.shape(inputs)
- batch_size = inputs_shape[0]
- h_axis, w_axis = H_AXIS, W_AXIS
- img_hd = math_ops.cast(inputs_shape[h_axis], dtypes.float32)
- img_wd = math_ops.cast(inputs_shape[w_axis], dtypes.float32)
- height_translate = self._rng.uniform(
- shape=[batch_size, 1],
- minval=self.height_lower,
- maxval=self.height_upper,
- dtype=dtypes.float32)
- height_translate = height_translate * img_hd
- width_translate = self._rng.uniform(
- shape=[batch_size, 1],
- minval=self.width_lower,
- maxval=self.width_upper,
- dtype=dtypes.float32)
- width_translate = width_translate * img_wd
- translations = math_ops.cast(
- array_ops.concat([width_translate, height_translate], axis=1),
- dtype=dtypes.float32)
- return transform(
- inputs,
- get_translation_matrix(translations),
- interpolation=self.interpolation,
- fill_mode=self.fill_mode,
- fill_value=self.fill_value)
-
- output = control_flow_util.smart_cond(training, random_translated_inputs,
- lambda: inputs)
- output.set_shape(inputs.shape)
- return output
-
- def compute_output_shape(self, input_shape):
- return input_shape
-
- def get_config(self):
- config = {
- 'height_factor': self.height_factor,
- 'width_factor': self.width_factor,
- 'fill_mode': self.fill_mode,
- 'fill_value': self.fill_value,
- 'interpolation': self.interpolation,
- 'seed': self.seed,
- }
- base_config = super(RandomTranslation, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
-
-def get_translation_matrix(translations, name=None):
- """Returns projective transform(s) for the given translation(s).
-
- Args:
- translations: A matrix of 2-element lists representing [dx, dy] to translate
- for each image (for a batch of images).
- name: The name of the op.
-
- Returns:
- A tensor of shape (num_images, 8) projective transforms which can be given
- to `transform`.
- """
- with backend.name_scope(name or 'translation_matrix'):
- num_translations = array_ops.shape(translations)[0]
- # The translation matrix looks like:
- # [[1 0 -dx]
- # [0 1 -dy]
- # [0 0 1]]
- # where the last entry is implicit.
- # Translation matrices are always float32.
- return array_ops.concat(
- values=[
- array_ops.ones((num_translations, 1), dtypes.float32),
- array_ops.zeros((num_translations, 1), dtypes.float32),
- -translations[:, 0, None],
- array_ops.zeros((num_translations, 1), dtypes.float32),
- array_ops.ones((num_translations, 1), dtypes.float32),
- -translations[:, 1, None],
- array_ops.zeros((num_translations, 2), dtypes.float32),
- ],
- axis=1)
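A small check of the flattened transform produced by `get_translation_matrix`, assuming the helper above is in scope; the offsets are hypothetical:

import tensorflow as tf

translations = tf.constant([[3.0, 5.0]])  # [dx, dy] for a single image
# Flattened projective transform [a0, a1, a2, b0, b1, b2, c0, c1]:
# expected [[1., 0., -3., 0., 1., -5., 0., 0.]]
print(get_translation_matrix(translations))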
-
-
-def transform(images,
- transforms,
- fill_mode='reflect',
- fill_value=0.0,
- interpolation='bilinear',
- output_shape=None,
- name=None):
- """Applies the given transform(s) to the image(s).
-
- Args:
- images: A tensor of shape (num_images, num_rows, num_columns, num_channels)
- (NHWC), (num_rows, num_columns, num_channels) (HWC), or (num_rows,
- num_columns) (HW). The rank must be statically known (the shape is not
- `TensorShape(None)`).
- transforms: Projective transform matrix/matrices. A vector of length 8 or
- tensor of size N x 8. If one row of transforms is [a0, a1, a2, b0, b1, b2,
- c0, c1], then it maps the *output* point `(x, y)` to a transformed *input*
- point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`, where
- `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to the
- transform mapping input points to output points. Note that gradients are
- not backpropagated into transformation parameters.
- fill_mode: Points outside the boundaries of the input are filled according
- to the given mode (one of `{'constant', 'reflect', 'wrap', 'nearest'}`).
- - *reflect*: `(d c b a | a b c d | d c b a)` The input is extended by
- reflecting about the edge of the last pixel.
- - *constant*: `(k k k k | a b c d | k k k k)` The input is extended by
- filling all values beyond the edge with the same constant value k = 0.
- - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by
- wrapping around to the opposite edge.
- - *nearest*: `(a a a a | a b c d | d d d d)` The input is extended by the
- nearest pixel.
- fill_value: a float represents the value to be filled outside the boundaries
- when `fill_mode` is "constant".
- interpolation: Interpolation mode. Supported values: "nearest", "bilinear".
- output_shape: Output dimension after the transform, [height, width]. If None,
- output is the same size as input image.
- name: The name of the op.
- Input shape:
- 4D tensor with shape: `(samples, height, width, channels)`,
- data_format='channels_last'.
- Output shape:
- 4D tensor with shape: `(samples, height, width, channels)`,
- data_format='channels_last'.
-
- Returns:
- Image(s) with the same type and shape as `images`, with the given
- transform(s) applied. Transformed coordinates outside of the input image
- will be filled with zeros.
-
- Raises:
- TypeError: If `image` is an invalid type.
- ValueError: If `output_shape` is not a 1-D int32 Tensor.
- """
- with backend.name_scope(name or 'transform'):
- if output_shape is None:
- output_shape = array_ops.shape(images)[1:3]
- if not context.executing_eagerly():
- output_shape_value = tensor_util.constant_value(output_shape)
- if output_shape_value is not None:
- output_shape = output_shape_value
-
- output_shape = ops.convert_to_tensor_v2_with_dispatch(
- output_shape, dtypes.int32, name='output_shape')
-
- if not output_shape.get_shape().is_compatible_with([2]):
- raise ValueError('output_shape must be a 1-D Tensor of 2 elements: '
- 'new_height, new_width, instead got '
- '{}'.format(output_shape))
-
- fill_value = ops.convert_to_tensor_v2_with_dispatch(
- fill_value, dtypes.float32, name='fill_value')
-
- return gen_image_ops.ImageProjectiveTransformV3(
- images=images,
- output_shape=output_shape,
- fill_value=fill_value,
- transforms=transforms,
- fill_mode=fill_mode.upper(),
- interpolation=interpolation.upper())
-
-
-def get_rotation_matrix(angles, image_height, image_width, name=None):
- """Returns projective transform(s) for the given angle(s).
-
- Args:
- angles: A scalar angle to rotate all images by, or (for batches of images) a
- vector with an angle to rotate each image in the batch. The rank must be
- statically known (the shape is not `TensorShape(None)`).
- image_height: Height of the image(s) to be transformed.
- image_width: Width of the image(s) to be transformed.
- name: The name of the op.
-
- Returns:
- A tensor of shape (num_images, 8). Projective transforms which can be given
- to operation `image_projective_transform_v2`. If one row of transforms is
- [a0, a1, a2, b0, b1, b2, c0, c1], then it maps the *output* point
- `(x, y)` to a transformed *input* point
- `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`,
- where `k = c0 x + c1 y + 1`.
- """
- with backend.name_scope(name or 'rotation_matrix'):
- x_offset = ((image_width - 1) - (math_ops.cos(angles) *
- (image_width - 1) - math_ops.sin(angles) *
- (image_height - 1))) / 2.0
- y_offset = ((image_height - 1) - (math_ops.sin(angles) *
- (image_width - 1) + math_ops.cos(angles) *
- (image_height - 1))) / 2.0
- num_angles = array_ops.shape(angles)[0]
- return array_ops.concat(
- values=[
- math_ops.cos(angles)[:, None],
- -math_ops.sin(angles)[:, None],
- x_offset[:, None],
- math_ops.sin(angles)[:, None],
- math_ops.cos(angles)[:, None],
- y_offset[:, None],
- array_ops.zeros((num_angles, 2), dtypes.float32),
- ],
- axis=1)
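Similarly, a zero angle should map to the identity transform; a tiny check assuming `get_rotation_matrix` above is in scope:

import tensorflow as tf

# cos(0)=1, sin(0)=0 and both offsets vanish, so the flattened transform is
# [[1., 0., 0., 0., 1., 0., 0., 0.]] (up to a -0. in the second entry).
print(get_rotation_matrix(tf.constant([0.0]), image_height=32.0, image_width=32.0))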
-
-
-@keras_export('keras.layers.experimental.preprocessing.RandomRotation')
-class RandomRotation(base_layer.Layer):
- """Randomly rotate each image.
-
- By default, random rotations are only applied during training.
- At inference time, the layer does nothing. If you need to apply random
- rotations at inference time, set `training` to True when calling the layer.
-
- Input shape:
- 4D tensor with shape:
- `(samples, height, width, channels)`, data_format='channels_last'.
-
- Output shape:
- 4D tensor with shape:
- `(samples, height, width, channels)`, data_format='channels_last'.
-
- Attributes:
- factor: a float represented as fraction of 2pi, or a tuple of size 2
- representing lower and upper bound for rotating clockwise and
- counter-clockwise. A positive value means rotating counter-clockwise,
- while a negative value means rotating clockwise. When represented as a single
- float, this value is used for both the upper and lower bound. For
- instance, `factor=(-0.2, 0.3)` results in an output rotation by a random
- amount in the range `[-20% * 2pi, 30% * 2pi]`. `factor=0.2` results in an
- output rotating by a random amount in the range `[-20% * 2pi, 20% * 2pi]`.
- fill_mode: Points outside the boundaries of the input are filled according
- to the given mode (one of `{'constant', 'reflect', 'wrap', 'nearest'}`).
- - *reflect*: `(d c b a | a b c d | d c b a)` The input is extended by
- reflecting about the edge of the last pixel.
- - *constant*: `(k k k k | a b c d | k k k k)` The input is extended by
- filling all values beyond the edge with the same constant value k = 0.
- - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by
- wrapping around to the opposite edge.
- - *nearest*: `(a a a a | a b c d | d d d d)` The input is extended by the
- nearest pixel.
- interpolation: Interpolation mode. Supported values: "nearest", "bilinear".
- seed: Integer. Used to create a random seed.
- fill_value: a float represents the value to be filled outside the boundaries
- when `fill_mode` is "constant".
- Raises:
- ValueError: if the upper bound of `factor` is less than the lower bound.
- """
-
- def __init__(self,
- factor,
- fill_mode='reflect',
- interpolation='bilinear',
- seed=None,
- fill_value=0.0,
- **kwargs):
- self.factor = factor
- if isinstance(factor, (tuple, list)):
- self.lower = factor[0]
- self.upper = factor[1]
- else:
- self.lower = -factor
- self.upper = factor
- if self.upper < self.lower:
- raise ValueError('`factor` cannot have an upper bound less than the '
- 'lower bound, got {}'.format(factor))
- check_fill_mode_and_interpolation(fill_mode, interpolation)
- self.fill_mode = fill_mode
- self.fill_value = fill_value
- self.interpolation = interpolation
- self.seed = seed
- self._rng = make_generator(self.seed)
- self.input_spec = InputSpec(ndim=4)
- super(RandomRotation, self).__init__(**kwargs)
-
- def call(self, inputs, training=True):
- if training is None:
- training = backend.learning_phase()
-
- def random_rotated_inputs():
- """Rotated inputs with random ops."""
- inputs_shape = array_ops.shape(inputs)
- batch_size = inputs_shape[0]
- img_hd = math_ops.cast(inputs_shape[H_AXIS], dtypes.float32)
- img_wd = math_ops.cast(inputs_shape[W_AXIS], dtypes.float32)
- min_angle = self.lower * 2. * np.pi
- max_angle = self.upper * 2. * np.pi
- angles = self._rng.uniform(
- shape=[batch_size], minval=min_angle, maxval=max_angle)
- return transform(
- inputs,
- get_rotation_matrix(angles, img_hd, img_wd),
- fill_mode=self.fill_mode,
- fill_value=self.fill_value,
- interpolation=self.interpolation)
-
- output = control_flow_util.smart_cond(training, random_rotated_inputs,
- lambda: inputs)
- output.set_shape(inputs.shape)
- return output
-
- def compute_output_shape(self, input_shape):
- return input_shape
-
- def get_config(self):
- config = {
- 'factor': self.factor,
- 'fill_mode': self.fill_mode,
- 'fill_value': self.fill_value,
- 'interpolation': self.interpolation,
- 'seed': self.seed,
- }
- base_config = super(RandomRotation, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
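A usage sketch of RandomRotation, noting that `factor` is a fraction of 2*pi (so `factor=0.25` means rotations of up to +/-90 degrees); the random input is illustrative only:

import numpy as np
import tensorflow as tf

rotate = tf.keras.layers.experimental.preprocessing.RandomRotation(factor=0.25, seed=3)
images = np.random.random((2, 28, 28, 1)).astype('float32')
print(rotate(images, training=True).shape)  # (2, 28, 28, 1)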
-
-
-@keras_export('keras.layers.experimental.preprocessing.RandomZoom')
-class RandomZoom(base_layer.Layer):
- """Randomly zoom each image during training.
-
- Args:
- height_factor: a float represented as fraction of value, or a tuple of size
- 2 representing lower and upper bound for zooming vertically. When
- represented as a single float, this value is used for both the upper and
- lower bound. A positive value means zooming out, while a negative value
- means zooming in. For instance, `height_factor=(0.2, 0.3)` results in an
- output zoomed out by a random amount in the range [+20%, +30%].
- `height_factor=(-0.3, -0.2)` results in an output zoomed in by a random
- amount in the range [+20%, +30%].
- width_factor: a float represented as fraction of value, or a tuple of size 2
- representing lower and upper bound for zooming horizontally. When
- represented as a single float, this value is used for both the upper and
- lower bound. For instance, `width_factor=(0.2, 0.3)` results in an output
- zooming out by between 20% and 30%. `width_factor=(-0.3, -0.2)` results in
- an output zooming in by between 20% and 30%. Defaults to `None`, i.e.,
- zooming in the vertical and horizontal directions while preserving the
- aspect ratio.
- fill_mode: Points outside the boundaries of the input are filled according
- to the given mode (one of `{'constant', 'reflect', 'wrap', 'nearest'}`).
- - *reflect*: `(d c b a | a b c d | d c b a)` The input is extended by
- reflecting about the edge of the last pixel.
- - *constant*: `(k k k k | a b c d | k k k k)` The input is extended by
- filling all values beyond the edge with the same constant value k = 0.
- - *wrap*: `(a b c d | a b c d | a b c d)` The input is extended by
- wrapping around to the opposite edge.
- - *nearest*: `(a a a a | a b c d | d d d d)` The input is extended by the
- nearest pixel.
- interpolation: Interpolation mode. Supported values: "nearest", "bilinear".
- seed: Integer. Used to create a random seed.
- fill_value: a float represents the value to be filled outside the boundaries
- when `fill_mode` is "constant".
-
- Example:
-
- >>> input_img = np.random.random((32, 224, 224, 3))
- >>> layer = tf.keras.layers.experimental.preprocessing.RandomZoom(.5, .2)
- >>> out_img = layer(input_img)
- >>> out_img.shape
- TensorShape([32, 224, 224, 3])
-
- Input shape:
- 4D tensor with shape: `(samples, height, width, channels)`,
- data_format='channels_last'.
- Output shape:
- 4D tensor with shape: `(samples, height, width, channels)`,
- data_format='channels_last'.
- Raises:
- ValueError: if `height_factor` has values outside of [-1, 1], or if
- `width_factor` has values smaller than -1.
- """
-
- def __init__(self,
- height_factor,
- width_factor=None,
- fill_mode='reflect',
- interpolation='bilinear',
- seed=None,
- fill_value=0.0,
- **kwargs):
- self.height_factor = height_factor
- if isinstance(height_factor, (tuple, list)):
- self.height_lower = height_factor[0]
- self.height_upper = height_factor[1]
- else:
- self.height_lower = -height_factor
- self.height_upper = height_factor
-
- if abs(self.height_lower) > 1. or abs(self.height_upper) > 1.:
- raise ValueError('`height_factor` must have values between [-1, 1], '
- 'got {}'.format(height_factor))
-
- self.width_factor = width_factor
- if width_factor is not None:
- if isinstance(width_factor, (tuple, list)):
- self.width_lower = width_factor[0]
- self.width_upper = width_factor[1]
- else:
- self.width_lower = -width_factor # pylint: disable=invalid-unary-operand-type
- self.width_upper = width_factor
-
- if self.width_lower < -1. or self.width_upper < -1.:
- raise ValueError('`width_factor` must have values larger than -1, '
- 'got {}'.format(width_factor))
-
- check_fill_mode_and_interpolation(fill_mode, interpolation)
-
- self.fill_mode = fill_mode
- self.fill_value = fill_value
- self.interpolation = interpolation
- self.seed = seed
- self._rng = make_generator(self.seed)
- self.input_spec = InputSpec(ndim=4)
- super(RandomZoom, self).__init__(**kwargs)
-
- def call(self, inputs, training=True):
- if training is None:
- training = backend.learning_phase()
-
- def random_zoomed_inputs():
- """Zoomed inputs with random ops."""
- inputs_shape = array_ops.shape(inputs)
- batch_size = inputs_shape[0]
- img_hd = math_ops.cast(inputs_shape[H_AXIS], dtypes.float32)
- img_wd = math_ops.cast(inputs_shape[W_AXIS], dtypes.float32)
- height_zoom = self._rng.uniform(
- shape=[batch_size, 1],
- minval=1. + self.height_lower,
- maxval=1. + self.height_upper)
- if self.width_factor is not None:
- width_zoom = self._rng.uniform(
- shape=[batch_size, 1],
- minval=1. + self.width_lower,
- maxval=1. + self.width_upper)
- else:
- width_zoom = height_zoom
- zooms = math_ops.cast(
- array_ops.concat([width_zoom, height_zoom], axis=1),
- dtype=dtypes.float32)
- return transform(
- inputs,
- get_zoom_matrix(zooms, img_hd, img_wd),
- fill_mode=self.fill_mode,
- fill_value=self.fill_value,
- interpolation=self.interpolation)
-
- output = control_flow_util.smart_cond(training, random_zoomed_inputs,
- lambda: inputs)
- output.set_shape(inputs.shape)
- return output
-
- def compute_output_shape(self, input_shape):
- return input_shape
-
- def get_config(self):
- config = {
- 'height_factor': self.height_factor,
- 'width_factor': self.width_factor,
- 'fill_mode': self.fill_mode,
- 'fill_value': self.fill_value,
- 'interpolation': self.interpolation,
- 'seed': self.seed,
- }
- base_config = super(RandomZoom, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
-
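To make the factor semantics documented above concrete, here is a minimal NumPy sketch (the `sample_zoom` helper is hypothetical and not part of the layer) of how a `height_factor` argument translates into the per-image zoom values that `call` samples:

import numpy as np

def sample_zoom(factor, batch_size, seed=0):
    # Mirror RandomZoom's handling of `factor`: a scalar f becomes the range (-f, f).
    lower, upper = factor if isinstance(factor, (tuple, list)) else (-factor, factor)
    rng = np.random.default_rng(seed)
    # The layer draws one value per image in [1 + lower, 1 + upper]; values above 1
    # zoom out, values below 1 zoom in.
    return rng.uniform(1. + lower, 1. + upper, size=(batch_size, 1))

print(sample_zoom((0.2, 0.3), batch_size=4))    # ~1.2-1.3, i.e. zoomed out by 20-30%
print(sample_zoom((-0.3, -0.2), batch_size=4))  # ~0.7-0.8, i.e. zoomed in by 20-30%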
-def get_zoom_matrix(zooms, image_height, image_width, name=None):
- """Returns projective transform(s) for the given zoom(s).
-
- Args:
- zooms: A matrix of 2-element lists representing [zx, zy] to zoom for each
- image (for a batch of images).
- image_height: Height of the image(s) to be transformed.
- image_width: Width of the image(s) to be transformed.
- name: The name of the op.
-
- Returns:
- A tensor of shape (num_images, 8). Projective transforms which can be given
- to operation `image_projective_transform_v2`. If one row of transforms is
- [a0, a1, a2, b0, b1, b2, c0, c1], then it maps the *output* point
- `(x, y)` to a transformed *input* point
- `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`,
- where `k = c0 x + c1 y + 1`.
- """
- with backend.name_scope(name or 'zoom_matrix'):
- num_zooms = array_ops.shape(zooms)[0]
- # The zoom matrix looks like:
- # [[zx 0 0]
- # [0 zy 0]
- # [0 0 1]]
- # where the last entry is implicit.
- # Zoom matrices are always float32.
- x_offset = ((image_width - 1.) / 2.0) * (1.0 - zooms[:, 0, None])
- y_offset = ((image_height - 1.) / 2.0) * (1.0 - zooms[:, 1, None])
- return array_ops.concat(
- values=[
- zooms[:, 0, None],
- array_ops.zeros((num_zooms, 1), dtypes.float32),
- x_offset,
- array_ops.zeros((num_zooms, 1), dtypes.float32),
- zooms[:, 1, None],
- y_offset,
- array_ops.zeros((num_zooms, 2), dtypes.float32),
- ],
- axis=1)
-
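As a sanity check on the output-to-input mapping described in the docstring, the following NumPy sketch re-derives the same 8-parameter transform for a single image (the `zoom_matrix_np` and `apply_transform` helpers are illustrative only, assuming the mapping formula quoted above): a zoom value above 1 samples a larger input region, so the output appears zoomed out, while the image centre stays fixed.

import numpy as np

def zoom_matrix_np(zx, zy, image_height, image_width):
    # Same offsets as get_zoom_matrix, for one image.
    x_offset = ((image_width - 1.) / 2.) * (1. - zx)
    y_offset = ((image_height - 1.) / 2.) * (1. - zy)
    return [zx, 0., x_offset, 0., zy, y_offset, 0., 0.]

def apply_transform(t, x, y):
    a0, a1, a2, b0, b1, b2, c0, c1 = t
    k = c0 * x + c1 * y + 1.
    return (a0 * x + a1 * y + a2) / k, (b0 * x + b1 * y + b2) / k

# With zx = zy = 2.0 (zoom out) on a 5x5 image, the output centre (2, 2) still
# samples the input centre, while the output corner maps past the input edge.
t = zoom_matrix_np(2.0, 2.0, image_height=5., image_width=5.)
print(apply_transform(t, 2., 2.))  # -> (2.0, 2.0)
print(apply_transform(t, 0., 0.))  # -> (-2.0, -2.0)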
-
-@keras_export('keras.layers.experimental.preprocessing.RandomContrast')
-class RandomContrast(base_layer.Layer):
- """Adjust the contrast of an image or images by a random factor.
-
- Contrast is adjusted independently for each channel of each image during
- training.
-
- For each channel, this layer computes the mean of the image pixels in the
- channel and then adjusts each component `x` of each pixel to
- `(x - mean) * contrast_factor + mean`.
-
- Input shape:
- 4D tensor with shape:
- `(samples, height, width, channels)`, data_format='channels_last'.
-
- Output shape:
- 4D tensor with shape:
- `(samples, height, width, channels)`, data_format='channels_last'.
-
- Attributes:
- factor: a positive float represented as fraction of value, or a tuple of
- size 2 representing lower and upper bound. When represented as a single
- float, lower = upper. The contrast factor will be randomly picked between
- [1.0 - lower, 1.0 + upper].
- seed: Integer. Used to create a random seed.
- Raises:
- ValueError: if lower bound is not between [0, 1], or upper bound is
- negative.
- """
-
- def __init__(self, factor, seed=None, **kwargs):
- self.factor = factor
- if isinstance(factor, (tuple, list)):
- self.lower = factor[0]
- self.upper = factor[1]
- else:
- self.lower = self.upper = factor
- if self.lower < 0. or self.upper < 0. or self.lower > 1.:
- raise ValueError('Factor cannot have negative values or greater than 1.0,'
- ' got {}'.format(factor))
- self.seed = seed
- self._rng = make_generator(self.seed)
- self.input_spec = InputSpec(ndim=4)
- super(RandomContrast, self).__init__(**kwargs)
-
- def call(self, inputs, training=True):
- if training is None:
- training = backend.learning_phase()
-
- def random_contrasted_inputs():
- return image_ops.stateless_random_contrast(inputs, 1. - self.lower,
- 1. + self.upper,
- self._rng.make_seeds()[:, 0])
-
- output = control_flow_util.smart_cond(training, random_contrasted_inputs,
- lambda: inputs)
- output.set_shape(inputs.shape)
- return output
-
- def compute_output_shape(self, input_shape):
- return input_shape
-
- def get_config(self):
- config = {
- 'factor': self.factor,
- 'seed': self.seed,
- }
- base_config = super(RandomContrast, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
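The per-channel formula quoted in the docstring, `(x - mean) * contrast_factor + mean`, is easy to verify in isolation. The sketch below is plain NumPy (the `adjust_contrast_np` helper is hypothetical, not the layer's stateless op) and shows that the channel means are preserved while the spread around them is scaled:

import numpy as np

def adjust_contrast_np(images, contrast_factor):
    # Per image and per channel, scale each pixel's deviation from the
    # channel mean: (x - mean) * contrast_factor + mean.
    mean = images.mean(axis=(1, 2), keepdims=True)
    return (images - mean) * contrast_factor + mean

images = np.random.rand(2, 4, 4, 3).astype(np.float32)
out = adjust_contrast_np(images, contrast_factor=1.3)
# The channel means are unchanged; only the contrast around them changes.
np.testing.assert_allclose(out.mean(axis=(1, 2)), images.mean(axis=(1, 2)), atol=1e-5)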
-
-@keras_export('keras.layers.experimental.preprocessing.RandomHeight')
-class RandomHeight(base_layer.Layer):
- """Randomly vary the height of a batch of images during training.
-
- Adjusts the height of a batch of images by a random factor. The input
- should be a 4-D tensor in the "channels_last" image data format.
-
- By default, this layer is inactive during inference.
-
- Args:
- factor: A positive float (fraction of original height), or a tuple of size 2
- representing lower and upper bound for resizing vertically. When
- represented as a single float, this value is used for both the upper and
- lower bound. For instance, `factor=(0.2, 0.3)` results in an output with
- height changed by a random amount in the range `[20%, 30%]`.
- `factor=(-0.2, 0.3)` results in an output with height changed by a random
- amount in the range `[-20%, +30%]`. `factor=0.2` results in an output with
- height changed by a random amount in the range `[-20%, +20%]`.
- interpolation: String, the interpolation method. Defaults to `bilinear`.
- Supports `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, `lanczos5`,
- `gaussian`, `mitchellcubic`.
- seed: Integer. Used to create a random seed.
- Input shape:
- 4D tensor with shape: `(samples, height, width, channels)`
- (data_format='channels_last').
- Output shape:
- 4D tensor with shape: `(samples, random_height, width, channels)`.
- """
-
- def __init__(self,
- factor,
- interpolation='bilinear',
- seed=None,
- **kwargs):
- self.factor = factor
- if isinstance(factor, (tuple, list)):
- self.height_lower = factor[0]
- self.height_upper = factor[1]
- else:
- self.height_lower = -factor
- self.height_upper = factor
-
- if self.height_upper < self.height_lower:
- raise ValueError('`factor` cannot have upper bound less than '
- 'lower bound, got {}'.format(factor))
- if self.height_lower < -1. or self.height_upper < -1.:
- raise ValueError('`factor` must have values larger than -1, '
- 'got {}'.format(factor))
- self.interpolation = interpolation
- self._interpolation_method = get_interpolation(interpolation)
- self.input_spec = InputSpec(ndim=4)
- self.seed = seed
- self._rng = make_generator(self.seed)
- super(RandomHeight, self).__init__(**kwargs)
-
- def call(self, inputs, training=True):
- if training is None:
- training = backend.learning_phase()
-
- def random_height_inputs():
- """Inputs height-adjusted with random ops."""
- inputs_shape = array_ops.shape(inputs)
- img_hd = math_ops.cast(inputs_shape[H_AXIS], dtypes.float32)
- img_wd = inputs_shape[W_AXIS]
- height_factor = self._rng.uniform(
- shape=[],
- minval=(1.0 + self.height_lower),
- maxval=(1.0 + self.height_upper))
- adjusted_height = math_ops.cast(height_factor * img_hd, dtypes.int32)
- adjusted_size = array_ops.stack([adjusted_height, img_wd])
- output = image_ops.resize_images_v2(
- images=inputs, size=adjusted_size, method=self._interpolation_method)
- original_shape = inputs.shape.as_list()
- output_shape = [original_shape[0]] + [None] + original_shape[2:4]
- output.set_shape(output_shape)
- return output
-
- return control_flow_util.smart_cond(training, random_height_inputs,
- lambda: inputs)
-
- def compute_output_shape(self, input_shape):
- input_shape = tensor_shape.TensorShape(input_shape).as_list()
- return tensor_shape.TensorShape(
- [input_shape[0], None, input_shape[2], input_shape[3]])
-
- def get_config(self):
- config = {
- 'factor': self.factor,
- 'interpolation': self.interpolation,
- 'seed': self.seed,
- }
- base_config = super(RandomHeight, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
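RandomHeight (and the analogous RandomWidth below) resizes one spatial dimension by a factor drawn from `[1 + lower, 1 + upper]`. A minimal sketch of the output-height computation, assuming a hypothetical `adjusted_height` helper that mirrors the arithmetic in `call`:

import numpy as np

def adjusted_height(orig_height, factor, seed=0):
    # Mirror RandomHeight: a scalar factor f means the range (-f, f).
    lower, upper = factor if isinstance(factor, (tuple, list)) else (-factor, factor)
    height_factor = np.random.default_rng(seed).uniform(1. + lower, 1. + upper)
    # The adjusted height is the scaled original height cast to an integer.
    return int(height_factor * orig_height)

# factor=0.2 on a 100-pixel-high image yields heights roughly in [80, 120].
print([adjusted_height(100, 0.2, seed=i) for i in range(5)])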
-
-@keras_export('keras.layers.experimental.preprocessing.RandomWidth')
-class RandomWidth(base_layer.Layer):
- """Randomly vary the width of a batch of images during training.
-
- Adjusts the width of a batch of images by a random factor. The input
- should be a 4-D tensor in the "channels_last" image data format.
-
- By default, this layer is inactive during inference.
-
- Args:
- factor: A positive float (fraction of original width), or a tuple of size 2
- representing lower and upper bound for resizing horizontally. When
- represented as a single float, this value is used for both the upper and
- lower bound. For instance, `factor=(0.2, 0.3)` results in an output with
- width changed by a random amount in the range `[20%, 30%]`. `factor=(-0.2,
- 0.3)` results in an output with width changed by a random amount in the
- range `[-20%, +30%]`. `factor=0.2` results in an output with width changed
- by a random amount in the range `[-20%, +20%]`.
- interpolation: String, the interpolation method. Defaults to `bilinear`.
- Supports `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, `lanczos5`,
- `gaussian`, `mitchellcubic`.
- seed: Integer. Used to create a random seed.
- Input shape:
- 4D tensor with shape: `(samples, height, width, channels)`
- (data_format='channels_last').
- Output shape:
- 4D tensor with shape: `(samples, height, random_width, channels)`.
- """
-
- def __init__(self,
- factor,
- interpolation='bilinear',
- seed=None,
- **kwargs):
- self.factor = factor
- if isinstance(factor, (tuple, list)):
- self.width_lower = factor[0]
- self.width_upper = factor[1]
- else:
- self.width_lower = -factor
- self.width_upper = factor
- if self.width_upper < self.width_lower:
- raise ValueError('`factor` cannot have upper bound less than '
- 'lower bound, got {}'.format(factor))
- if self.width_lower < -1. or self.width_upper < -1.:
- raise ValueError('`factor` must have values larger than -1, '
- 'got {}'.format(factor))
- self.interpolation = interpolation
- self._interpolation_method = get_interpolation(interpolation)
- self.input_spec = InputSpec(ndim=4)
- self.seed = seed
- self._rng = make_generator(self.seed)
- super(RandomWidth, self).__init__(**kwargs)
-
- def call(self, inputs, training=True):
- if training is None:
- training = backend.learning_phase()
-
- def random_width_inputs():
- """Inputs width-adjusted with random ops."""
- inputs_shape = array_ops.shape(inputs)
- img_hd = inputs_shape[H_AXIS]
- img_wd = math_ops.cast(inputs_shape[W_AXIS], dtypes.float32)
- width_factor = self._rng.uniform(
- shape=[],
- minval=(1.0 + self.width_lower),
- maxval=(1.0 + self.width_upper))
- adjusted_width = math_ops.cast(width_factor * img_wd, dtypes.int32)
- adjusted_size = array_ops.stack([img_hd, adjusted_width])
- output = image_ops.resize_images_v2(
- images=inputs, size=adjusted_size, method=self._interpolation_method)
- original_shape = inputs.shape.as_list()
- output_shape = original_shape[0:2] + [None] + [original_shape[3]]
- output.set_shape(output_shape)
- return output
-
- return control_flow_util.smart_cond(training, random_width_inputs,
- lambda: inputs)
-
- def compute_output_shape(self, input_shape):
- input_shape = tensor_shape.TensorShape(input_shape).as_list()
- return tensor_shape.TensorShape(
- [input_shape[0], input_shape[1], None, input_shape[3]])
-
- def get_config(self):
- config = {
- 'factor': self.factor,
- 'interpolation': self.interpolation,
- 'seed': self.seed,
- }
- base_config = super(RandomWidth, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
-
-def make_generator(seed=None):
- """Creates a random generator.
-
- Args:
- seed: the seed to initialize the generator. If None, the generator will be
- initialized non-deterministically.
-
- Returns:
- A generator object.
- """
- if seed is not None:
- return stateful_random_ops.Generator.from_seed(seed)
- else:
- return stateful_random_ops.Generator.from_non_deterministic_state()
-
-
-def get_interpolation(interpolation):
- interpolation = interpolation.lower()
- if interpolation not in _RESIZE_METHODS:
- raise NotImplementedError(
- 'Value not recognized for `interpolation`: {}. Supported values '
- 'are: {}'.format(interpolation, _RESIZE_METHODS.keys()))
- return _RESIZE_METHODS[interpolation]
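A brief usage sketch for the two module-level helpers above, under the assumption that `_RESIZE_METHODS` maps interpolation names to `tf.image.ResizeMethod` values and using only public TensorFlow APIs; this is illustrative, not part of the removed module:

# Minimal sketch: seeded generator (same idea as make_generator(1337)) plus the
# resize method that get_interpolation('bilinear') roughly resolves to.
import tensorflow as tf

rng = tf.random.Generator.from_seed(1337)
offsets = rng.uniform(shape=[4, 1], minval=0., maxval=1.)

method = tf.image.ResizeMethod.BILINEAR
resized = tf.image.resize(tf.zeros([1, 8, 8, 3]), size=[4, 4], method=method)
print(offsets.shape, resized.shape)  # (4, 1) (1, 4, 4, 3)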
diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_distribution_test.py
deleted file mode 100644
index aa57a8f..0000000
--- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_distribution_test.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Distribution tests for keras.layers.preprocessing.image_preprocessing."""
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import combinations as ds_combinations
-from tensorflow.python.distribute import multi_process_runner
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_combinations as combinations
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.distribute import strategy_combinations
-from tensorflow.python.keras.layers.preprocessing import image_preprocessing
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-
-
-@ds_combinations.generate(
- combinations.combine(
- strategy=strategy_combinations.all_strategies +
- strategy_combinations.multi_worker_mirrored_strategies,
- mode=["eager", "graph"]))
-class ImagePreprocessingDistributionTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_distribution(self, strategy):
- if "CentralStorage" in type(strategy).__name__:
- self.skipTest("Does not work with CentralStorageStrategy yet.")
- # TODO(b/159738418): large image input causes OOM in ubuntu multi gpu.
- np_images = np.random.random((32, 32, 32, 3)).astype(np.float32)
- image_dataset = dataset_ops.Dataset.from_tensor_slices(np_images).batch(
- 16, drop_remainder=True)
-
- with strategy.scope():
- input_data = keras.Input(shape=(32, 32, 3), dtype=dtypes.float32)
- image_preprocessor = keras.Sequential([
- image_preprocessing.Resizing(height=256, width=256),
- image_preprocessing.RandomCrop(height=224, width=224),
- image_preprocessing.RandomTranslation(.1, .1),
- image_preprocessing.RandomRotation(.2),
- image_preprocessing.RandomFlip(),
- image_preprocessing.RandomZoom(.2, .2)])
- preprocessed_image = image_preprocessor(input_data)
- flatten_layer = keras.layers.Flatten(data_format="channels_last")
- output = flatten_layer(preprocessed_image)
- cls_layer = keras.layers.Dense(units=1, activation="sigmoid")
- output = cls_layer(output)
- model = keras.Model(inputs=input_data, outputs=output)
- model.compile(loss="binary_crossentropy")
- _ = model.predict(image_dataset)
-
-
-if __name__ == "__main__":
- v2_compat.enable_v2_behavior()
- multi_process_runner.test_main()
diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py
deleted file mode 100644
index 0f03b69..0000000
--- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py
+++ /dev/null
@@ -1,1493 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for image preprocessing layers."""
-
-import functools
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.compat import compat
-from tensorflow.python.distribute.mirrored_strategy import MirroredStrategy
-from tensorflow.python.framework import errors
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.engine import sequential
-from tensorflow.python.keras.layers.preprocessing import image_preprocessing
-from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.ops import gen_stateful_random_ops
-from tensorflow.python.ops import gen_stateless_random_ops_v2
-from tensorflow.python.ops import image_ops_impl as image_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import stateless_random_ops
-from tensorflow.python.platform import test
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class ResizingTest(keras_parameterized.TestCase):
-
- def _run_test(self, kwargs, expected_height, expected_width):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- kwargs.update({'height': expected_height, 'width': expected_width})
- with testing_utils.use_gpu():
- testing_utils.layer_test(
- image_preprocessing.Resizing,
- kwargs=kwargs,
- input_shape=(num_samples, orig_height, orig_width, channels),
- expected_output_shape=(None, expected_height, expected_width,
- channels))
-
- @parameterized.named_parameters(('down_sample_bilinear_2_by_2', {
- 'interpolation': 'bilinear'
- }, 2, 2), ('down_sample_bilinear_3_by_2', {
- 'interpolation': 'bilinear'
- }, 3, 2), ('down_sample_nearest_2_by_2', {
- 'interpolation': 'nearest'
- }, 2, 2), ('down_sample_nearest_3_by_2', {
- 'interpolation': 'nearest'
- }, 3, 2), ('down_sample_area_2_by_2', {
- 'interpolation': 'area'
- }, 2, 2), ('down_sample_area_3_by_2', {
- 'interpolation': 'area'
- }, 3, 2), ('down_sample_crop_to_aspect_ratio_3_by_2', {
- 'interpolation': 'bilinear',
- 'crop_to_aspect_ratio': True,
- }, 3, 2))
- def test_down_sampling(self, kwargs, expected_height, expected_width):
- with CustomObjectScope({'Resizing': image_preprocessing.Resizing}):
- self._run_test(kwargs, expected_height, expected_width)
-
- @parameterized.named_parameters(('up_sample_bilinear_10_by_12', {
- 'interpolation': 'bilinear'
- }, 10, 12), ('up_sample_bilinear_12_by_12', {
- 'interpolation': 'bilinear'
- }, 12, 12), ('up_sample_nearest_10_by_12', {
- 'interpolation': 'nearest'
- }, 10, 12), ('up_sample_nearest_12_by_12', {
- 'interpolation': 'nearest'
- }, 12, 12), ('up_sample_area_10_by_12', {
- 'interpolation': 'area'
- }, 10, 12), ('up_sample_area_12_by_12', {
- 'interpolation': 'area'
- }, 12, 12), ('up_sample_crop_to_aspect_ratio_12_by_14', {
- 'interpolation': 'bilinear',
- 'crop_to_aspect_ratio': True,
- }, 12, 14))
- def test_up_sampling(self, kwargs, expected_height, expected_width):
- with CustomObjectScope({'Resizing': image_preprocessing.Resizing}):
- self._run_test(kwargs, expected_height, expected_width)
-
- def test_down_sampling_numeric(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 16), (1, 4, 4, 1)).astype(dtype)
- layer = image_preprocessing.Resizing(
- height=2, width=2, interpolation='nearest')
- output_image = layer(input_image)
- # pyformat: disable
- expected_output = np.asarray([
- [5, 7],
- [13, 15]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 2, 2, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_up_sampling_numeric(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 4), (1, 2, 2, 1)).astype(dtype)
- layer = image_preprocessing.Resizing(
- height=4, width=4, interpolation='nearest')
- output_image = layer(input_image)
- # pyformat: disable
- expected_output = np.asarray([
- [0, 0, 1, 1],
- [0, 0, 1, 1],
- [2, 2, 3, 3],
- [2, 2, 3, 3]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 4, 4, 1))
- self.assertAllEqual(expected_output, output_image)
-
- @parameterized.named_parameters(('reshape_bilinear_10_by_4', {
- 'interpolation': 'bilinear'
- }, 10, 4))
- def test_reshaping(self, kwargs, expected_height, expected_width):
- with CustomObjectScope({'Resizing': image_preprocessing.Resizing}):
- self._run_test(kwargs, expected_height, expected_width)
-
- def test_invalid_interpolation(self):
- with self.assertRaises(NotImplementedError):
- image_preprocessing.Resizing(5, 5, 'invalid_interpolation')
-
- def test_config_with_custom_name(self):
- layer = image_preprocessing.Resizing(5, 5, name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.Resizing.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
- def test_crop_to_aspect_ratio(self):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 16), (1, 4, 4, 1)).astype('float32')
- layer = image_preprocessing.Resizing(4, 2, crop_to_aspect_ratio=True)
- output_image = layer(input_image)
- expected_output = np.asarray([
- [1, 2],
- [5, 6],
- [9, 10],
- [13, 14]
- ]).astype('float32')
- expected_output = np.reshape(expected_output, (1, 4, 2, 1))
- self.assertAllEqual(expected_output, output_image)
-
-
-def get_numpy_center_crop(images, expected_height, expected_width):
- orig_height = images.shape[1]
- orig_width = images.shape[2]
- height_start = int((orig_height - expected_height) / 2)
- width_start = int((orig_width - expected_width) / 2)
- height_end = height_start + expected_height
- width_end = width_start + expected_width
- return images[:, height_start:height_end, width_start:width_end, :]
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CenterCropTest(keras_parameterized.TestCase):
-
- def _run_test(self, expected_height, expected_width):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- kwargs = {'height': expected_height, 'width': expected_width}
- input_images = np.random.random(
- (num_samples, orig_height, orig_width, channels)).astype(np.float32)
- expected_output = get_numpy_center_crop(input_images, expected_height,
- expected_width)
- with testing_utils.use_gpu():
- testing_utils.layer_test(
- image_preprocessing.CenterCrop,
- kwargs=kwargs,
- input_shape=(num_samples, orig_height, orig_width, channels),
- input_data=input_images,
- expected_output=expected_output,
- expected_output_shape=(None, expected_height, expected_width,
- channels))
-
- @parameterized.named_parameters(('center_crop_3_by_4', 3, 4),
- ('center_crop_3_by_2', 3, 2))
- def test_center_crop_aligned(self, expected_height, expected_width):
- with CustomObjectScope({'CenterCrop': image_preprocessing.CenterCrop}):
- self._run_test(expected_height, expected_width)
-
- @parameterized.named_parameters(('center_crop_4_by_5', 4, 5),
- ('center_crop_4_by_3', 4, 3))
- def test_center_crop_mis_aligned(self, expected_height, expected_width):
- with CustomObjectScope({'CenterCrop': image_preprocessing.CenterCrop}):
- self._run_test(expected_height, expected_width)
-
- @parameterized.named_parameters(('center_crop_4_by_6', 4, 6),
- ('center_crop_3_by_2', 3, 2))
- def test_center_crop_half_mis_aligned(self, expected_height, expected_width):
- with CustomObjectScope({'CenterCrop': image_preprocessing.CenterCrop}):
- self._run_test(expected_height, expected_width)
-
- @parameterized.named_parameters(('center_crop_5_by_12', 5, 12),
- ('center_crop_10_by_8', 10, 8),
- ('center_crop_10_by_12', 10, 12))
- def test_invalid_center_crop(self, expected_height, expected_width):
- # InternalError is raised by tf.function MLIR lowering pass when TFRT
- # is enabled.
- with self.assertRaisesRegex(
- (errors.InvalidArgumentError, errors.InternalError),
- r'assertion failed|error: \'tf.Slice\' op'):
- self._run_test(expected_height, expected_width)
-
- def test_config_with_custom_name(self):
- layer = image_preprocessing.CenterCrop(5, 5, name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.CenterCrop.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class RandomCropTest(keras_parameterized.TestCase):
-
- def _run_test(self, expected_height, expected_width):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- kwargs = {'height': expected_height, 'width': expected_width}
- with testing_utils.use_gpu():
- testing_utils.layer_test(
- image_preprocessing.RandomCrop,
- kwargs=kwargs,
- input_shape=(num_samples, orig_height, orig_width, channels),
- expected_output_shape=(None, expected_height, expected_width,
- channels))
-
- @parameterized.named_parameters(('random_crop_5_by_12', 5, 12),
- ('random_crop_10_by_8', 10, 8),
- ('random_crop_10_by_12', 10, 12))
- def test_invalid_random_crop(self, expected_height, expected_width):
- # InternalError is raised by tf.function MLIR lowering pass when TFRT
- # is enabled.
- with self.assertRaises((errors.InvalidArgumentError, errors.InternalError)):
- with CustomObjectScope({'RandomCrop': image_preprocessing.RandomCrop}):
- self._run_test(expected_height, expected_width)
-
- def test_training_with_mock(self):
- np.random.seed(1337)
- height, width = 3, 4
- height_offset = np.random.randint(low=0, high=3)
- width_offset = np.random.randint(low=0, high=5)
- mock_offset = [0, height_offset, width_offset, 0]
- with test.mock.patch.object(
- stateless_random_ops,
- 'stateless_random_uniform',
- return_value=mock_offset):
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomCrop(height, width)
- inp = np.random.random((12, 5, 8, 3))
- actual_output = layer(inp, training=1)
- expected_output = inp[:, height_offset:(height_offset + height),
- width_offset:(width_offset + width), :]
- self.assertAllClose(expected_output, actual_output)
-
- @parameterized.named_parameters(('random_crop_4_by_6', 4, 6),
- ('random_crop_3_by_2', 3, 2))
- def test_random_crop_output_shape(self, expected_height, expected_width):
- with CustomObjectScope({'RandomCrop': image_preprocessing.RandomCrop}):
- self._run_test(expected_height, expected_width)
-
- def test_random_crop_full_height(self):
- self._run_test(5, 2)
-
- def test_random_crop_full_width(self):
- self._run_test(3, 8)
-
- def test_random_crop_full(self):
- np.random.seed(1337)
- height, width = 8, 16
- inp = np.random.random((12, 8, 16, 3))
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomCrop(height, width)
- actual_output = layer(inp, training=0)
- self.assertAllClose(inp, actual_output)
-
- def test_predicting_with_mock_longer_height(self):
- np.random.seed(1337)
- height, width = 3, 3
- inp = np.random.random((12, 10, 6, 3))
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomCrop(height, width)
- actual_output = layer(inp, training=0)
- resized_inp = image_ops.resize_images_v2(inp, size=[5, 3])
- expected_output = resized_inp[:, 1:4, :, :]
- self.assertAllClose(expected_output, actual_output)
-
- def test_predicting_with_mock_longer_width(self):
- np.random.seed(1337)
- height, width = 4, 6
- inp = np.random.random((12, 8, 16, 3))
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomCrop(height, width)
- actual_output = layer(inp, training=0)
- resized_inp = image_ops.resize_images_v2(inp, size=[4, 8])
- expected_output = resized_inp[:, :, 1:7, :]
- self.assertAllClose(expected_output, actual_output)
-
- def test_config_with_custom_name(self):
- layer = image_preprocessing.RandomCrop(5, 5, name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.RandomCrop.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-class RescalingTest(keras_parameterized.TestCase):
-
- @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
- def test_rescaling_base(self):
- kwargs = {'scale': 1. / 127.5, 'offset': -1.}
- testing_utils.layer_test(
- image_preprocessing.Rescaling,
- kwargs=kwargs,
- input_shape=(2, 5, 6, 3),
- expected_output_shape=(None, 5, 6, 3))
-
- @testing_utils.run_v2_only
- def test_rescaling_correctness_float(self):
- layer = image_preprocessing.Rescaling(scale=1. / 127.5, offset=-1.)
- inputs = random_ops.random_uniform((2, 4, 5, 3))
- outputs = layer(inputs)
- self.assertAllClose(outputs.numpy(), inputs.numpy() * (1. / 127.5) - 1)
-
- @testing_utils.run_v2_only
- def test_rescaling_correctness_int(self):
- layer = image_preprocessing.Rescaling(scale=1. / 127.5, offset=-1)
- inputs = random_ops.random_uniform((2, 4, 5, 3), 0, 100, dtype='int32')
- outputs = layer(inputs)
- self.assertEqual(outputs.dtype.name, 'float32')
- self.assertAllClose(outputs.numpy(), inputs.numpy() * (1. / 127.5) - 1)
-
- def test_config_with_custom_name(self):
- layer = image_preprocessing.Rescaling(0.5, name='rescaling')
- config = layer.get_config()
- layer_1 = image_preprocessing.Rescaling.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class RandomFlipTest(keras_parameterized.TestCase):
-
- def _run_test(self, mode, expected_output=None, mock_random=None):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- if mock_random is None:
- mock_random = [1 for _ in range(num_samples)]
- mock_random = np.reshape(mock_random, [2, 1, 1, 1])
- inp = np.random.random((num_samples, orig_height, orig_width, channels))
- if expected_output is None:
- expected_output = inp
- if mode == 'horizontal' or mode == 'horizontal_and_vertical':
- expected_output = np.flip(expected_output, axis=2)
- if mode == 'vertical' or mode == 'horizontal_and_vertical':
- expected_output = np.flip(expected_output, axis=1)
- with test.mock.patch.object(
- stateless_random_ops,
- 'stateless_random_uniform',
- return_value=mock_random,
- ):
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomFlip(mode)
- actual_output = layer(inp, training=1)
- self.assertAllClose(expected_output, actual_output)
-
- @parameterized.named_parameters(
- ('random_flip_horizontal', 'horizontal'),
- ('random_flip_vertical', 'vertical'),
- ('random_flip_both', 'horizontal_and_vertical'))
- def test_random_flip(self, mode):
- with CustomObjectScope({'RandomFlip': image_preprocessing.RandomFlip}):
- self._run_test(mode)
-
- def test_random_flip_horizontal_half(self):
- with CustomObjectScope({'RandomFlip': image_preprocessing.RandomFlip}):
- np.random.seed(1337)
- mock_random = [1, 0]
- mock_random = np.reshape(mock_random, [2, 1, 1, 1])
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = input_images.copy()
- expected_output[0, :, :, :] = np.flip(input_images[0, :, :, :], axis=1)
- self._run_test('horizontal', expected_output, mock_random)
-
- def test_random_flip_vertical_half(self):
- with CustomObjectScope({'RandomFlip': image_preprocessing.RandomFlip}):
- np.random.seed(1337)
- mock_random = [1, 0]
- mock_random = np.reshape(mock_random, [2, 1, 1, 1])
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = input_images.copy()
- expected_output[0, :, :, :] = np.flip(input_images[0, :, :, :], axis=0)
- self._run_test('vertical', expected_output, mock_random)
-
- def test_random_flip_inference(self):
- with CustomObjectScope({'RandomFlip': image_preprocessing.RandomFlip}):
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = input_images
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomFlip()
- actual_output = layer(input_images, training=0)
- self.assertAllClose(expected_output, actual_output)
-
- def test_random_flip_default(self):
- with CustomObjectScope({'RandomFlip': image_preprocessing.RandomFlip}):
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = np.flip(np.flip(input_images, axis=1), axis=2)
- mock_random = [1, 1]
- mock_random = np.reshape(mock_random, [2, 1, 1, 1])
- with test.mock.patch.object(
- stateless_random_ops,
- 'stateless_random_uniform',
- return_value=mock_random,
- ):
- with self.cached_session():
- layer = image_preprocessing.RandomFlip()
- actual_output = layer(input_images, training=1)
- self.assertAllClose(expected_output, actual_output)
-
- @testing_utils.run_v2_only
- def test_config_with_custom_name(self):
- layer = image_preprocessing.RandomFlip(name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.RandomFlip.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class RandomContrastTest(keras_parameterized.TestCase):
-
- def _run_test(self, lower, upper, expected_output=None, mock_random=None):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- if mock_random is None:
- mock_random = 0.2
- inp = np.random.random((num_samples, orig_height, orig_width, channels))
- if expected_output is None:
- # reduce mean on height.
- inp_mean = np.mean(inp, axis=1, keepdims=True)
- # reduce mean on width.
- inp_mean = np.mean(inp_mean, axis=2, keepdims=True)
- expected_output = (inp - inp_mean) * mock_random + inp_mean
- with test.mock.patch.object(
- stateless_random_ops,
- 'stateless_random_uniform',
- return_value=mock_random,
- ):
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomContrast((lower, upper))
- actual_output = layer(inp, training=True)
- self.assertAllClose(expected_output, actual_output)
-
- @parameterized.named_parameters(('random_contrast_2_by_5', 0.2, 0.5),
- ('random_contrast_2_by_13', 0.2, 1.3),
- ('random_contrast_5_by_2', 0.5, 0.2),
- ('random_contrast_10_by_10', 1.0, 1.0))
- def test_random_contrast(self, lower, upper):
- with CustomObjectScope(
- {'RandomContrast': image_preprocessing.RandomContrast}):
- self._run_test(lower, upper)
-
- @parameterized.named_parameters(('random_contrast_amplitude_2', 0.2),
- ('random_contrast_amplitude_5', 0.5))
- def test_random_contrast_amplitude(self, amplitude):
- with CustomObjectScope(
- {'RandomContrast': image_preprocessing.RandomContrast}):
- input_images = np.random.random((2, 5, 8, 3))
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomContrast(amplitude)
- layer(input_images)
-
- def test_random_contrast_inference(self):
- with CustomObjectScope(
- {'RandomContrast': image_preprocessing.RandomContrast}):
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = input_images
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomContrast((0.1, 0.2))
- actual_output = layer(input_images, training=False)
- self.assertAllClose(expected_output, actual_output)
-
- def test_random_contrast_int_dtype(self):
- with CustomObjectScope(
- {'RandomContrast': image_preprocessing.RandomContrast}):
- input_images = np.random.randint(low=0, high=255, size=(2, 5, 8, 3))
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomContrast((0.1, 0.2))
- layer(input_images)
-
- def test_random_contrast_invalid_bounds(self):
- with self.assertRaises(ValueError):
- image_preprocessing.RandomContrast((-0.1, .5))
-
- with self.assertRaises(ValueError):
- image_preprocessing.RandomContrast((1.1, .5))
-
- with self.assertRaises(ValueError):
- image_preprocessing.RandomContrast((0.1, -0.2))
-
- @testing_utils.run_v2_only
- def test_config_with_custom_name(self):
- layer = image_preprocessing.RandomContrast((.5, .6), name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.RandomContrast.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class RandomTranslationTest(keras_parameterized.TestCase):
-
- def _run_test(self, height_factor, width_factor):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- kwargs = {'height_factor': height_factor, 'width_factor': width_factor}
- with testing_utils.use_gpu():
- testing_utils.layer_test(
- image_preprocessing.RandomTranslation,
- kwargs=kwargs,
- input_shape=(num_samples, orig_height, orig_width, channels),
- expected_output_shape=(None, orig_height, orig_width, channels))
-
- @parameterized.named_parameters(
- ('random_translate_4_by_6', .4, .6), ('random_translate_3_by_2', .3, .2),
- ('random_translate_tuple_factor', (-.5, .4), (.2, .3)))
- def test_random_translation(self, height_factor, width_factor):
- self._run_test(height_factor, width_factor)
-
- def test_random_translation_up_numeric_reflect(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype)
- # Shifting by -.2 * 5 = 1 pixel.
- layer = image_preprocessing.RandomTranslation(
- height_factor=(-.2, -.2), width_factor=0.)
- output_image = layer(input_image)
- # pyformat: disable
- expected_output = np.asarray([
- [5, 6, 7, 8, 9],
- [10, 11, 12, 13, 14],
- [15, 16, 17, 18, 19],
- [20, 21, 22, 23, 24],
- [20, 21, 22, 23, 24]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 5, 5, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_translation_up_numeric_constant(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype)
- # Shifting by -.2 * 5 = 1 pixel.
- layer = image_preprocessing.RandomTranslation(
- height_factor=(-.2, -.2), width_factor=0., fill_mode='constant')
- output_image = layer(input_image)
- # pyformat: disable
- expected_output = np.asarray([
- [5, 6, 7, 8, 9],
- [10, 11, 12, 13, 14],
- [15, 16, 17, 18, 19],
- [20, 21, 22, 23, 24],
- [0, 0, 0, 0, 0]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 5, 5, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_translation_down_numeric_reflect(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype)
- # Shifting by .2 * 5 = 1 pixel.
- layer = image_preprocessing.RandomTranslation(
- height_factor=(.2, .2), width_factor=0.)
- output_image = layer(input_image)
- # pyformat: disable
- expected_output = np.asarray([
- [0, 1, 2, 3, 4],
- [0, 1, 2, 3, 4],
- [5, 6, 7, 8, 9],
- [10, 11, 12, 13, 14],
- [15, 16, 17, 18, 19]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 5, 5, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_translation_asymmetric_size_numeric_reflect(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 16), (1, 8, 2, 1)).astype(dtype)
- # Shifting by .5 * 8 = 4 pixels.
- layer = image_preprocessing.RandomTranslation(
- height_factor=(.5, .5), width_factor=0.)
- output_image = layer(input_image)
- # pyformat: disable
- expected_output = np.asarray([
- [6, 7],
- [4, 5],
- [2, 3],
- [0, 1],
- [0, 1],
- [2, 3],
- [4, 5],
- [6, 7],
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 8, 2, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_translation_down_numeric_constant(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype)
- # Shifting by .2 * 5 = 1 pixel.
- layer = image_preprocessing.RandomTranslation(
- height_factor=(.2, .2), width_factor=0., fill_mode='constant')
- output_image = layer(input_image)
- # pyformat: disable
- expected_output = np.asarray([
- [0, 0, 0, 0, 0],
- [0, 1, 2, 3, 4],
- [5, 6, 7, 8, 9],
- [10, 11, 12, 13, 14],
- [15, 16, 17, 18, 19]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 5, 5, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_translation_left_numeric_reflect(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype)
- # Shifting by .2 * 5 = 1 pixel.
- layer = image_preprocessing.RandomTranslation(
- height_factor=0., width_factor=(-.2, -.2))
- output_image = layer(input_image)
- # pyformat: disable
- expected_output = np.asarray([
- [1, 2, 3, 4, 4],
- [6, 7, 8, 9, 9],
- [11, 12, 13, 14, 14],
- [16, 17, 18, 19, 19],
- [21, 22, 23, 24, 24]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 5, 5, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_translation_left_numeric_constant(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype)
- # Shifting by -.2 * 5 = 1 pixel.
- layer = image_preprocessing.RandomTranslation(
- height_factor=0., width_factor=(-.2, -.2), fill_mode='constant')
- output_image = layer(input_image)
- # pyformat: disable
- expected_output = np.asarray([
- [1, 2, 3, 4, 0],
- [6, 7, 8, 9, 0],
- [11, 12, 13, 14, 0],
- [16, 17, 18, 19, 0],
- [21, 22, 23, 24, 0]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 5, 5, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_translation_inference(self):
- with CustomObjectScope(
- {'RandomTranslation': image_preprocessing.RandomTranslation}):
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = input_images
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomTranslation(.5, .5)
- actual_output = layer(input_images, training=0)
- self.assertAllClose(expected_output, actual_output)
-
- @testing_utils.run_v2_only
- def test_config_with_custom_name(self):
- layer = image_preprocessing.RandomTranslation(.5, .6, name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.RandomTranslation.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class RandomTransformTest(keras_parameterized.TestCase):
-
- def _run_random_transform_with_mock(self,
- transform_matrix,
- expected_output,
- mode,
- fill_value=0.0,
- interpolation='bilinear'):
- inp = np.arange(15).reshape((1, 5, 3, 1)).astype(np.float32)
- with self.cached_session():
- output = image_preprocessing.transform(
- inp,
- transform_matrix,
- fill_mode=mode,
- fill_value=fill_value,
- interpolation=interpolation)
- self.assertAllClose(expected_output, output)
-
- def test_random_translation_reflect(self):
- # reflected output is (dcba|abcd|dcba)
-
- # Test down shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[0., 1., 2.],
- [0., 1., 2.],
- [3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'reflect')
-
- # Test up shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11.],
- [12., 13., 14.],
- [12., 13., 14.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'reflect')
-
- # Test left shift by 1.
- # reflected output is (dcba|abcd|dcba)
- # pyformat: disable
- expected_output = np.asarray(
- [[1., 2., 2.],
- [4., 5., 5.],
- [7., 8., 8.],
- [10., 11., 11.],
- [13., 14., 14.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'reflect')
-
- # Test right shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[0., 0., 1.],
- [3., 3., 4],
- [6., 6., 7.],
- [9., 9., 10.],
- [12., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'reflect')
-
- def test_random_translation_wrap(self):
- # wrapped output is (abcd|abcd|abcd)
-
- # Test down shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[12., 13., 14.],
- [0., 1., 2.],
- [3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'wrap')
-
- # Test up shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11.],
- [12., 13., 14.],
- [0., 1., 2.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'wrap')
-
- # Test left shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[1., 2., 0.],
- [4., 5., 3.],
- [7., 8., 6.],
- [10., 11., 9.],
- [13., 14., 12.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'wrap')
-
- # Test right shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[2., 0., 1.],
- [5., 3., 4],
- [8., 6., 7.],
- [11., 9., 10.],
- [14., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'wrap')
-
- def test_random_translation_nearest(self):
- # nearest output is (aaaa|abcd|dddd)
-
- # Test down shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[0., 1., 2.],
- [0., 1., 2.],
- [3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'nearest')
-
- # Test up shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11.],
- [12., 13., 14.],
- [12., 13., 14.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'nearest')
-
- # Test left shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[1., 2., 2.],
- [4., 5., 5.],
- [7., 8., 8.],
- [10., 11., 11.],
- [13., 14., 14.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'nearest')
-
- # Test right shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[0., 0., 1.],
- [3., 3., 4],
- [6., 6., 7.],
- [9., 9., 10.],
- [12., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'nearest')
-
- def test_random_translation_constant_0(self):
- # constant output is (0000|abcd|0000)
-
- # Test down shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[0., 0., 0.],
- [0., 1., 2.],
- [3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'constant')
-
- # Test up shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11.],
- [12., 13., 14.],
- [0., 0., 0.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'constant')
-
- # Test left shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[1., 2., 0.],
- [4., 5., 0.],
- [7., 8., 0.],
- [10., 11., 0.],
- [13., 14., 0.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'constant')
-
- # Test right shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[0., 0., 1.],
- [0., 3., 4],
- [0., 6., 7.],
- [0., 9., 10.],
- [0., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(transform_matrix, expected_output,
- 'constant')
-
- def test_random_translation_constant_1(self):
- with compat.forward_compatibility_horizon(2020, 8, 6):
- # constant output is (1111|abcd|1111)
-
- # Test down shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[1., 1., 1.],
- [0., 1., 2.],
- [3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]])
- self._run_random_transform_with_mock(
- transform_matrix, expected_output, 'constant', fill_value=1.0)
-
- # Test up shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11.],
- [12., 13., 14.],
- [1., 1., 1.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]])
- self._run_random_transform_with_mock(
- transform_matrix, expected_output, 'constant', fill_value=1.0)
-
- # Test left shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[1., 2., 1.],
- [4., 5., 1.],
- [7., 8., 1.],
- [10., 11., 1.],
- [13., 14., 1.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(
- transform_matrix, expected_output, 'constant', fill_value=1.0)
-
- # Test right shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[1., 0., 1.],
- [1., 3., 4],
- [1., 6., 7.],
- [1., 9., 10.],
- [1., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(
- transform_matrix, expected_output, 'constant', fill_value=1.0)
-
- def test_random_translation_nearest_interpolation(self):
- # constant fill with nearest interpolation: output is (0000|abcd|0000)
-
- # Test down shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[0., 0., 0.],
- [0., 1., 2.],
- [3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., -1., 0., 0.]])
- self._run_random_transform_with_mock(
- transform_matrix,
- expected_output,
- mode='constant',
- interpolation='nearest')
-
- # Test up shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[3., 4., 5.],
- [6., 7., 8],
- [9., 10., 11.],
- [12., 13., 14.],
- [0., 0., 0.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 0., 0., 1., 1., 0., 0.]])
- self._run_random_transform_with_mock(
- transform_matrix,
- expected_output,
- mode='constant',
- interpolation='nearest')
-
- # Test left shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[1., 2., 0.],
- [4., 5., 0.],
- [7., 8., 0.],
- [10., 11., 0.],
- [13., 14., 0.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., 1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(
- transform_matrix,
- expected_output,
- mode='constant',
- interpolation='nearest')
-
- # Test right shift by 1.
- # pyformat: disable
- expected_output = np.asarray(
- [[0., 0., 1.],
- [0., 3., 4],
- [0., 6., 7.],
- [0., 9., 10.],
- [0., 12., 13.]]).reshape((1, 5, 3, 1)).astype(np.float32)
- # pyformat: enable
- transform_matrix = np.asarray([[1., 0., -1., 0., 1., 0., 0., 0.]])
- self._run_random_transform_with_mock(
- transform_matrix,
- expected_output,
- mode='constant',
- interpolation='nearest')
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class RandomRotationTest(keras_parameterized.TestCase):
-
- def _run_test(self, factor):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- kwargs = {'factor': factor}
- with testing_utils.use_gpu():
- testing_utils.layer_test(
- image_preprocessing.RandomRotation,
- kwargs=kwargs,
- input_shape=(num_samples, orig_height, orig_width, channels),
- expected_output_shape=(None, orig_height, orig_width, channels))
-
- @parameterized.named_parameters(('random_rotate_4', .4),
- ('random_rotate_3', .3),
- ('random_rotate_tuple_factor', (-.5, .4)))
- def test_random_rotation(self, factor):
- self._run_test(factor)
-
- def test_random_rotation_inference(self):
- with CustomObjectScope(
- {'RandomTranslation': image_preprocessing.RandomRotation}):
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = input_images
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomRotation(.5)
- actual_output = layer(input_images, training=0)
- self.assertAllClose(expected_output, actual_output)
-
- def test_distribution_strategy(self):
- """Tests that RandomRotation can be created within distribution strategies.
- """
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- with testing_utils.use_gpu():
- strat = MirroredStrategy(devices=['cpu', 'gpu'])
- with strat.scope():
- layer = image_preprocessing.RandomRotation(.5)
- output = strat.run(lambda: layer(input_images, training=True))
- values = output.values
- self.assertAllEqual(2, len(values))
-
- @testing_utils.run_v2_only
- def test_config_with_custom_name(self):
- layer = image_preprocessing.RandomRotation(.5, name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.RandomRotation.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class RandomZoomTest(keras_parameterized.TestCase):
-
- def _run_test(self, height_factor, width_factor):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- kwargs = {'height_factor': height_factor, 'width_factor': width_factor}
- with testing_utils.use_gpu():
- testing_utils.layer_test(
- image_preprocessing.RandomZoom,
- kwargs=kwargs,
- input_shape=(num_samples, orig_height, orig_width, channels),
- expected_output_shape=(None, orig_height, orig_width, channels))
-
- @parameterized.named_parameters(
- ('random_zoom_4_by_6', -.4, -.6), ('random_zoom_2_by_3', -.2, -.3),
- ('random_zoom_tuple_factor', (-.4, -.5), (-.2, -.3)))
- def test_random_zoom_in(self, height_factor, width_factor):
- self._run_test(height_factor, width_factor)
-
- @parameterized.named_parameters(
- ('random_zoom_4_by_6', .4, .6), ('random_zoom_2_by_3', .2, .3),
- ('random_zoom_tuple_factor', (.4, .5), (.2, .3)))
- def test_random_zoom_out(self, height_factor, width_factor):
- self._run_test(height_factor, width_factor)
-
- def test_random_zoom_in_numeric(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype)
- layer = image_preprocessing.RandomZoom((-.5, -.5), (-.5, -.5),
- interpolation='nearest')
- output_image = layer(np.expand_dims(input_image, axis=0))
- # pyformat: disable
- expected_output = np.asarray([
- [6, 7, 7, 8, 8],
- [11, 12, 12, 13, 13],
- [11, 12, 12, 13, 13],
- [16, 17, 17, 18, 18],
- [16, 17, 17, 18, 18]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 5, 5, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_zoom_out_numeric(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype)
- layer = image_preprocessing.RandomZoom((.5, .5), (.8, .8),
- fill_mode='constant',
- interpolation='nearest')
- output_image = layer(np.expand_dims(input_image, axis=0))
- # pyformat: disable
- expected_output = np.asarray([
- [0, 0, 0, 0, 0],
- [0, 5, 7, 9, 0],
- [0, 10, 12, 14, 0],
- [0, 20, 22, 24, 0],
- [0, 0, 0, 0, 0]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 5, 5, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_zoom_out_numeric_preserve_aspect_ratio(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype)
- layer = image_preprocessing.RandomZoom((.5, .5),
- fill_mode='constant',
- interpolation='nearest')
- output_image = layer(np.expand_dims(input_image, axis=0))
- # pyformat: disable
- expected_output = np.asarray([
- [0, 0, 0, 0, 0],
- [0, 6, 7, 9, 0],
- [0, 11, 12, 14, 0],
- [0, 21, 22, 24, 0],
- [0, 0, 0, 0, 0]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 5, 5, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_zoom_inference(self):
- with CustomObjectScope({'RandomZoom': image_preprocessing.RandomZoom}):
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = input_images
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomZoom(.5, .5)
- actual_output = layer(input_images, training=0)
- self.assertAllClose(expected_output, actual_output)
-
- @testing_utils.run_v2_only
- def test_config_with_custom_name(self):
- layer = image_preprocessing.RandomZoom(.5, .6, name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.RandomZoom.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class RandomHeightTest(keras_parameterized.TestCase):
-
- def _run_test(self, factor):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- with testing_utils.use_gpu():
- img = np.random.random((num_samples, orig_height, orig_width, channels))
- layer = image_preprocessing.RandomHeight(factor)
- img_out = layer(img, training=True)
- self.assertEqual(img_out.shape[0], 2)
- self.assertEqual(img_out.shape[2], 8)
- self.assertEqual(img_out.shape[3], 3)
-
- @parameterized.named_parameters(('random_height_4_by_6', (.4, .6)),
- ('random_height_3_by_2', (-.3, .2)),
- ('random_height_3', .3))
- def test_random_height_basic(self, factor):
- self._run_test(factor)
-
- def test_valid_random_height(self):
- # need (maxval - minval) * rnd + minval = 0.6
- mock_factor = 0
- with test.mock.patch.object(
- gen_stateful_random_ops, 'stateful_uniform', return_value=mock_factor):
- with test.mock.patch.object(
- gen_stateless_random_ops_v2,
- 'stateless_random_uniform_v2',
- return_value=mock_factor):
- with testing_utils.use_gpu():
- img = np.random.random((12, 5, 8, 3))
- layer = image_preprocessing.RandomHeight(.4)
- img_out = layer(img, training=True)
- self.assertEqual(img_out.shape[1], 3)
-
- def test_random_height_longer_numeric(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 6), (2, 3, 1)).astype(dtype)
- layer = image_preprocessing.RandomHeight(factor=(1., 1.))
-        # The output of RandomHeight() is float32 unless `interpolation` is set
-        # to `ResizeMethod.NEAREST_NEIGHBOR`; cast the output to `dtype`.
- output_image = math_ops.cast(
- layer(np.expand_dims(input_image, axis=0)), dtype=dtype)
- # pyformat: disable
- expected_output = np.asarray([
- [0, 1, 2],
- [0.75, 1.75, 2.75],
- [2.25, 3.25, 4.25],
- [3, 4, 5]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 4, 3, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_height_shorter_numeric(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 8), (4, 2, 1)).astype(dtype)
- layer = image_preprocessing.RandomHeight(
- factor=(-.5, -.5), interpolation='nearest')
- output_image = layer(np.expand_dims(input_image, axis=0))
- # pyformat: disable
- expected_output = np.asarray([
- [2, 3],
- [6, 7]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 2, 2, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_height_invalid_factor(self):
- with self.assertRaises(ValueError):
- image_preprocessing.RandomHeight((-1.5, .4))
-
- def test_random_height_inference(self):
- with CustomObjectScope({'RandomHeight': image_preprocessing.RandomHeight}):
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = input_images
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomHeight(.5)
- actual_output = layer(input_images, training=0)
- self.assertAllClose(expected_output, actual_output)
-
- @testing_utils.run_v2_only
- def test_config_with_custom_name(self):
- layer = image_preprocessing.RandomHeight(.5, name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.RandomHeight.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class RandomWidthTest(keras_parameterized.TestCase):
-
- def _run_test(self, factor):
- np.random.seed(1337)
- num_samples = 2
- orig_height = 5
- orig_width = 8
- channels = 3
- with testing_utils.use_gpu():
- img = np.random.random((num_samples, orig_height, orig_width, channels))
- layer = image_preprocessing.RandomWidth(factor)
- img_out = layer(img, training=True)
- self.assertEqual(img_out.shape[0], 2)
- self.assertEqual(img_out.shape[1], 5)
- self.assertEqual(img_out.shape[3], 3)
-
- @parameterized.named_parameters(('random_width_4_by_6', (.4, .6)),
- ('random_width_3_by_2', (-.3, .2)),
- ('random_width_3', .3))
- def test_random_width_basic(self, factor):
- self._run_test(factor)
-
- def test_valid_random_width(self):
- # need (maxval - minval) * rnd + minval = 0.6
- mock_factor = 0
- with test.mock.patch.object(
- gen_stateful_random_ops, 'stateful_uniform', return_value=mock_factor):
- with test.mock.patch.object(
- gen_stateless_random_ops_v2,
- 'stateless_random_uniform_v2',
- return_value=mock_factor):
- with testing_utils.use_gpu():
- img = np.random.random((12, 8, 5, 3))
- layer = image_preprocessing.RandomWidth(.4)
- img_out = layer(img, training=True)
- self.assertEqual(img_out.shape[2], 3)
-
- def test_random_width_longer_numeric(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 6), (3, 2, 1)).astype(dtype)
- layer = image_preprocessing.RandomWidth(factor=(1., 1.))
-        # The output of RandomWidth() is float32 unless `interpolation` is set
-        # to `ResizeMethod.NEAREST_NEIGHBOR`; cast the output to `dtype`.
- output_image = math_ops.cast(
- layer(np.expand_dims(input_image, axis=0)), dtype=dtype)
- # pyformat: disable
- expected_output = np.asarray([
- [0, 0.25, 0.75, 1],
- [2, 2.25, 2.75, 3],
- [4, 4.25, 4.75, 5]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 3, 4, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_width_shorter_numeric(self):
- for dtype in (np.int64, np.float32):
- with testing_utils.use_gpu():
- input_image = np.reshape(np.arange(0, 8), (2, 4, 1)).astype(dtype)
- layer = image_preprocessing.RandomWidth(
- factor=(-.5, -.5), interpolation='nearest')
- output_image = layer(np.expand_dims(input_image, axis=0))
- # pyformat: disable
- expected_output = np.asarray([
- [1, 3],
- [5, 7]
- ]).astype(dtype)
- # pyformat: enable
- expected_output = np.reshape(expected_output, (1, 2, 2, 1))
- self.assertAllEqual(expected_output, output_image)
-
- def test_random_width_invalid_factor(self):
- with self.assertRaises(ValueError):
- image_preprocessing.RandomWidth((-1.5, .4))
-
- def test_random_width_inference(self):
- with CustomObjectScope({'RandomWidth': image_preprocessing.RandomWidth}):
- input_images = np.random.random((2, 5, 8, 3)).astype(np.float32)
- expected_output = input_images
- with testing_utils.use_gpu():
- layer = image_preprocessing.RandomWidth(.5)
- actual_output = layer(input_images, training=0)
- self.assertAllClose(expected_output, actual_output)
-
- @testing_utils.run_v2_only
- def test_config_with_custom_name(self):
- layer = image_preprocessing.RandomWidth(.5, name='image_preproc')
- config = layer.get_config()
- layer_1 = image_preprocessing.RandomWidth.from_config(config)
- self.assertEqual(layer_1.name, layer.name)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class LearningPhaseTest(keras_parameterized.TestCase):
-
- def test_plain_call(self):
- layer = image_preprocessing.RandomWidth(.5, seed=123)
- shape = (12, 12, 3)
- img = np.random.random((12,) + shape)
- out = layer(img) # Default to training=True
- self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape)
-
- out = layer(img, training=True)
- self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape)
-
- out = layer(img, training=False)
- self.assertEqual(tuple(int(i) for i in out.shape[1:]), shape)
-
- def test_call_in_container(self):
- layer1 = image_preprocessing.RandomWidth(.5, seed=123)
- layer2 = image_preprocessing.RandomHeight(.5, seed=123)
- seq = sequential.Sequential([layer1, layer2])
-
- shape = (12, 12, 3)
- img = np.random.random((12,) + shape)
- out = seq(img) # Default to training=True
- self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape)
-
- out = seq(img, training=True)
- self.assertNotEqual(tuple(int(i) for i in out.shape[1:]), shape)
-
- out = seq(img, training=False)
- self.assertEqual(tuple(int(i) for i in out.shape[1:]), shape)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class DeterminismTest(keras_parameterized.TestCase):
-
- @parameterized.named_parameters(
- ('random_flip', image_preprocessing.RandomFlip),
- ('random_contrast',
- functools.partial(image_preprocessing.RandomContrast, factor=1.)),
- ('random_crop',
- functools.partial(image_preprocessing.RandomCrop, height=2, width=2)),
- ('random_translation',
- functools.partial(image_preprocessing.RandomTranslation, 0.3, 0.2)),
- ('random_rotation',
- functools.partial(image_preprocessing.RandomRotation, 0.5)),
- ('random_zoom', functools.partial(image_preprocessing.RandomZoom, 0.2)),
- ('random_height', functools.partial(image_preprocessing.RandomHeight,
- 0.4)),
- ('random_width', functools.partial(image_preprocessing.RandomWidth, 0.3)),
- )
- def test_seed_constructor_arg(self, layer_cls):
- input_image = np.random.random((2, 5, 8, 3)).astype(np.float32)
-
- layer1 = layer_cls(seed=0.)
- layer2 = layer_cls(seed=0.)
- layer1_output = layer1(input_image)
- layer2_output = layer2(input_image)
-
- self.assertAllClose(layer1_output.numpy().tolist(),
- layer2_output.numpy().tolist())
-
-
-if __name__ == '__main__':
- test.main()
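Editor's note: the LearningPhaseTest and DeterminismTest cases deleted above pin down two behaviours that the maintained copies of these layers are expected to preserve: augmentation is applied by default and under training=True but is an identity under training=False, and two layers built with the same seed produce identical outputs. A small sketch of the same checks against the public API (assuming tf.keras.layers.RandomWidth and RandomRotation, which live under tf.keras.layers.experimental.preprocessing in older releases):

    import numpy as np
    import tensorflow as tf

    images = np.random.random((4, 12, 12, 3)).astype(np.float32)

    layer = tf.keras.layers.RandomWidth(0.5, seed=123)
    augmented = layer(images)                    # training defaults to True
    passthrough = layer(images, training=False)  # identity at inference time
    np.testing.assert_allclose(passthrough, images)

    layer_a = tf.keras.layers.RandomRotation(0.5, seed=0)
    layer_b = tf.keras.layers.RandomRotation(0.5, seed=0)
    # Same seed and same input -> identical augmentation.
    np.testing.assert_allclose(layer_a(images), layer_b(images))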
diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py
deleted file mode 100644
index 08b14d2..0000000
--- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py
+++ /dev/null
@@ -1,931 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras index lookup preprocessing layer."""
-# pylint: disable=g-classes-have-attributes
-
-import collections
-import json
-import operator
-
-import numpy as np
-
-from tensorflow.python.eager import context
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras import backend
-from tensorflow.python.keras.engine import base_preprocessing_layer
-from tensorflow.python.keras.layers.preprocessing import category_encoding
-from tensorflow.python.keras.layers.preprocessing import table_utils
-from tensorflow.python.keras.saving.saved_model import layer_serialization
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
-
-INT = "int"
-MULTI_HOT = "multi_hot"
-ONE_HOT = "one_hot"
-COUNT = "count"
-TF_IDF = "tf_idf"
-
-_VOCAB_NAME = "vocab"
-_IDF_WEIGHTS_NAME = "idf_weights"
-
-
-class _NullInitializer(lookup_ops.TextFileInitializer):
- """A placeholder initializer for restoring this layer from a SavedModel."""
-
- def __init__(self, key_dtype, value_dtype):
- """Construct a table initializer object.
-
- Args:
- key_dtype: Type of the table keys.
- value_dtype: Type of the table values.
- """
- self._key_dtype = dtypes.as_dtype(key_dtype)
- self._value_dtype = dtypes.as_dtype(value_dtype)
-
- @property
- def key_dtype(self):
- """The expected table key dtype."""
- return self._key_dtype
-
- @property
- def value_dtype(self):
- """The expected table value dtype."""
- return self._value_dtype
-
- def initialize(self, table):
- """Returns the table initialization op."""
- pass
-
- @property
- def _shared_name(self):
- """Returns a shared name to be used by the table."""
- shared_name = "NULL_INITIALIZER_"
- if context.executing_eagerly():
- # Ensure a unique name when eager execution is enabled to avoid spurious
-      # sharing issues.
- shared_name += str(backend.get_uid(shared_name))
- return shared_name
-
-
-class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer):
- """Maps values from a vocabulary to integer indices.
-
- This layer translates a set of arbitrary hashables into an integer output via
- a table-based lookup, with optional out-of-vocabulary handling. This is the
- basis layer for both IntegerLookup and StringLookup; it holds the common
- logic but is not intended to be exported as part of the Keras API.
-
- Args:
- max_tokens: The maximum size of the vocabulary for this layer. If None,
- there is no cap on the size of the vocabulary. Note that this size
- includes the OOV and mask tokens.
- num_oov_indices: The number of out-of-vocabulary tokens to use. If this
- value is more than 1, OOV inputs are hashed to determine their OOV value.
- If this value is 0, OOV inputs will cause an error when calling the layer.
- mask_token: A token that represents masked inputs. When `output_mode` is
- `"int"`, the token is included in vocabulary and mapped to index 0. In
- other output modes, the token will not appear in the vocabulary and
- instances of the mask token in the input will be dropped. If set to None,
- no mask term will be added.
- oov_token: Only used when `invert` is True. The token to return for OOV
- indices.
- vocabulary: An optional list of vocabulary terms. If the list contains the
- same token multiple times, an error will be thrown.
- invert: Only valid when `output_mode` is `"int"`. If True, this layer will
- map indices to vocabulary items instead of mapping vocabulary items to
-      indices. Defaults to False.
- output_mode: Specification for the output of the layer. Defaults to `"int"`.
- Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or
- `"tf_idf"` configuring the layer as follows:
- - `"int"`: Return the raw integer indices of the input tokens.
- - `"one_hot"`: Encodes each individual element in the input into an
- array the same size as the vocabulary, containing a 1 at the element
- index. If the last dimension is size 1, will encode on that dimension.
- If the last dimension is not size 1, will append a new dimension for
- the encoded output.
- - `"multi_hot"`: Encodes each sample in the input into a single array
- the same size as the vocabulary, containing a 1 for each vocabulary
- term present in the sample. Treats the last dimension as the sample
- dimension, if input shape is (..., sample_length), output shape will
- be (..., num_tokens).
- - `"count"`: As `"multi_hot"`, but the int array contains a count of the
- number of times the token at that index appeared in the sample.
- - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
- find the value in each token slot.
- pad_to_max_tokens: Only valid when `output_mode` is `"multi_hot"`,
- `"count"`, or `"tf_idf"`. If True, the output will have its feature axis
- padded to `max_tokens` even if the number of unique tokens in the
- vocabulary is less than max_tokens, resulting in a tensor of shape
- [batch_size, max_tokens] regardless of vocabulary size. Defaults to False.
- sparse: Boolean. Only applicable to `"multi_hot"` and `"count"` output
- modes. If True, returns a `SparseTensor` instead of a dense `Tensor`.
- Defaults to False.
- """
-
- def __init__(self,
- max_tokens,
- num_oov_indices,
- mask_token,
- oov_token,
- vocabulary=None,
- invert=False,
- output_mode=INT,
- sparse=False,
- pad_to_max_tokens=False,
- **kwargs):
- # If max_tokens is set, the value must be greater than 1 - otherwise we
- # are creating a 0-element vocab, which doesn't make sense.
- if max_tokens is not None and max_tokens <= 1:
- raise ValueError("If set, `max_tokens` must be greater than 1. "
- "You passed {}".format(max_tokens))
-
- if num_oov_indices < 0:
- raise ValueError("`num_oov_indices` must be greater than or equal to 0. "
- "You passed {}".format(num_oov_indices))
-
- # Support deprecated names for output_modes.
- if output_mode == "binary":
- output_mode = MULTI_HOT
- if output_mode == "tf-idf":
- output_mode = TF_IDF
- # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF)
- layer_utils.validate_string_arg(
- output_mode,
- allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT, TF_IDF),
- layer_name=self.__class__.__name__,
- arg_name="output_mode")
-
- if invert and output_mode != INT:
- raise ValueError("`output_mode` must be {} when `invert` is true. You "
- "passed {}".format(INT, output_mode))
-
- self.invert = invert
- self.max_tokens = max_tokens
- self.num_oov_indices = num_oov_indices
- self.output_mode = output_mode
- self.sparse = sparse
- self.pad_to_max_tokens = pad_to_max_tokens
- self._called = False
-
- # A note on vocab_size: we need to always keep a non-Tensor representation
- # of vocab_size around to use in graph building. Because we might be
- # in a tf.function, we can't rely on evaluating the actual tables to
- # find the value either.
- self._vocab_size = None
-    # We need to keep track of our current vocab size outside of our layer
-    # weights to support a static output shape when `output_mode != INT`. The
-    # bincount ops do not set shape on their outputs, which means we have to
-    # set it ourselves. We persist the current vocab size as a hidden part of
-    # the config when serializing our model.
- if "vocabulary_size" in kwargs:
- self._vocab_size = kwargs["vocabulary_size"]
- del kwargs["vocabulary_size"]
-
- restore_from_static_table = kwargs.pop("has_static_table", False)
-
- # Make sure the mask token and oov token are truly of the dtype we want. We
- # can ignore strings here, because they have only one dtype.
- dtype = kwargs["dtype"]
- if dtype == dtypes.int32:
- mask_token = None if mask_token is None else np.int32(mask_token)
- oov_token = None if oov_token is None else np.int32(oov_token)
- elif dtype == dtypes.int64:
- mask_token = None if mask_token is None else np.int64(mask_token)
- oov_token = None if oov_token is None else np.int64(oov_token)
- self.mask_token = mask_token
- self.oov_token = oov_token
-
- if max_tokens is not None:
- available_vocab_size = max_tokens - self._token_start_index()
- else:
- available_vocab_size = None
-
- super(IndexLookup, self).__init__(
- combiner=_IndexLookupCombiner(
- vocab_size=available_vocab_size,
- mask_value=mask_token,
- oov_value=oov_token,
- compute_idf=(output_mode == TF_IDF)),
- **kwargs)
-
- # We need to save the key dtype so that we know if we're expecting int64
- # keys. If we are, we will cast int32 inputs to int64 as well.
- if invert:
- self._key_dtype = dtypes.int64
- self._value_dtype = self.dtype
- self._mask_key = 0
- self._mask_value = mask_token
- key_index = lookup_ops.TextFileIndex.LINE_NUMBER
- value_index = lookup_ops.TextFileIndex.WHOLE_LINE
- default_value = self.oov_token
- oov_indices = None
- else:
- self._key_dtype = self.dtype
- self._value_dtype = dtypes.int64
- self._mask_key = mask_token
- key_index = lookup_ops.TextFileIndex.WHOLE_LINE
- value_index = lookup_ops.TextFileIndex.LINE_NUMBER
- # Masks should map to 0 for int output and be dropped otherwise. Max ints
- # will be dropped from the bincount op.
- self._mask_value = 0 if self.output_mode == INT else dtypes.int64.max
- oov_start = self._oov_start_index()
- token_start = self._token_start_index()
- if self.num_oov_indices == 0:
- # If there are no OOV indices, we map OOV tokens to -1 and error out
- # during call if we find a negative index.
- default_value = -1
- oov_indices = None
- elif self.num_oov_indices == 1:
- # If there is only one OOV index, we can set that index as the default
- # value of the index_lookup table.
- default_value = oov_start
- oov_indices = None
- else:
-        # If we have multiple OOV values, we need to do a further hashing step;
- # to make this easier, we set the OOV value to -1. (This lets us do a
- # vectorized add and cast to boolean to determine locations where we
- # need to do extra hashing.)
- default_value = -1
- oov_indices = list(range(oov_start, token_start))
-
- self._static_vocabulary_path = None
- has_vocab_path = (vocabulary is not None and isinstance(vocabulary, str))
- if has_vocab_path or restore_from_static_table:
- self._has_static_table = True
- if vocabulary is None:
- # If we're restoring a layer that was saved with a static table
- # initializer, we create a fake initializer object to let the code
- # progress. The savedmodel restoration code will handle restoring
- # the actual data.
- initializer = _NullInitializer(self._key_dtype, self._value_dtype)
- else:
- if not gfile.Exists(vocabulary):
- raise ValueError("Vocabulary file %s does not exist." % (vocabulary,))
- self._static_vocabulary_path = vocabulary
- num_tokens = table_utils.num_tokens_in_file(vocabulary)
- self._vocab_size = self._token_start_index() + num_tokens
-
- initializer = lookup_ops.TextFileInitializer(
- filename=vocabulary,
- key_dtype=self._key_dtype,
- key_index=key_index,
- value_dtype=self._value_dtype,
- value_index=value_index,
- value_index_offset=self._token_start_index())
-
- self._table = lookup_ops.StaticHashTable(
- initializer, default_value=default_value)
- self._table_handler = table_utils.TableHandler(
- table=self._table,
- mask_token=self._mask_key if self.mask_token is not None else None,
- mask_value=self._mask_value,
- oov_tokens=oov_indices)
-
- tracked_table = self._add_trackable(self._table, trainable=False)
-
- else:
- self._has_static_table = False
- self._table = lookup_ops.MutableHashTable(
- key_dtype=self._key_dtype,
- value_dtype=self._value_dtype,
- default_value=default_value,
- name=(self._name + "_index_table"))
- self._table_handler = table_utils.TableHandler(
- table=self._table,
- oov_tokens=oov_indices)
- if vocabulary is not None:
- self.set_vocabulary(vocabulary)
- tracked_table = self._add_trackable(self._table, trainable=False)
-
- if self.output_mode == TF_IDF:
- # The TF-IDF weight may have a (None,) tensorshape. This creates
- # a 1D variable with arbitrary shape, which we can assign any weight to
- # so long as it has 1 dimension. In order to properly initialize this
- # weight in Keras, we need to provide a custom callable initializer which
- # does not depend on the shape of the weight (as all other initializers
-      # do) since the shape is not known. Hence the lambda shape, dtype: [0].
- if not self.pad_to_max_tokens or max_tokens is None:
- initializer = lambda shape, dtype: [0]
- else:
- initializer = init_ops.zeros_initializer
-
- # We are adding these here instead of in build() since they do not depend
- # on the input shape at all.
- idf_shape = (max_tokens,) if self.pad_to_max_tokens else (None,)
- self.tf_idf_weights = self._add_state_variable(
- name="idf",
- shape=tensor_shape.TensorShape(idf_shape),
- dtype=backend.floatx(),
- initializer=initializer)
-
- # This is a workaround for summary() on this layer. Because the table is
- # not mutable during training, the effective number of parameters (and so
- # the weight shape) is 0; we add this as an attr so that the parameter
- # counting code in the Model object doesn't throw an attribute error.
- tracked_table.shape = tensor_shape.TensorShape((0,))
-
- def compute_output_shape(self, input_shape):
- if self.output_mode == INT:
- return input_shape
- if self._vocab_size and not self.pad_to_max_tokens:
- out_depth = self._vocab_size
- else:
- out_depth = self.max_tokens
- return tensor_shape.TensorShape([input_shape[0], out_depth])
-
- def compute_output_signature(self, input_spec):
- output_shape = self.compute_output_shape(input_spec.shape.as_list())
- output_dtype = (self._value_dtype if self.output_mode == INT
- else backend.floatx())
- return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype)
-
- def adapt(self, data, reset_state=True):
- """Fits the state of the preprocessing layer to the dataset.
-
- Overrides the default adapt method to apply relevant preprocessing to the
- inputs before passing to the combiner.
-
- Args:
- data: The data to train on. It can be passed either as a tf.data Dataset,
- or as a numpy array.
- reset_state: Optional argument specifying whether to clear the state of
- the layer at the start of the call to `adapt`. This must be True for
- this layer, which does not support repeated calls to `adapt`.
- """
- if not reset_state:
- raise ValueError("IndexLookup does not support streaming adapts.")
- super(IndexLookup, self).adapt(data, reset_state)
-
- def get_vocabulary(self, include_special_tokens=True):
- """Returns the current vocabulary of the layer.
-
- Args:
- include_special_tokens: If True, the returned vocabulary will include mask
- and OOV tokens, and a term's index in the vocabulary will equal the
- term's index when calling the layer. If False, the returned vocabulary
- will not include any mask or OOV tokens.
- """
- if self.vocabulary_size() is None:
- return []
-
-    # The MutableHashTable data will not be sorted, so we will create an inverted
-    # lookup here and use that to look up a range of indices [0, vocab_size).
- keys, values = self._table.export()
- vocab, indices = (values, keys) if self.invert else (keys, values)
- lookup = collections.defaultdict(
- lambda: self.oov_token,
- zip(indices.numpy(), self._tensor_vocab_to_numpy(vocab)))
- vocab = [lookup[x] for x in range(self.vocabulary_size())]
- if self.mask_token is not None and self.output_mode == INT:
- vocab[0] = self.mask_token
- if not include_special_tokens:
- vocab = vocab[self._token_start_index():]
- return vocab
-
- def vocabulary_size(self):
- """Gets the current size of the layer's vocabulary.
-
- Returns:
-      The integer size of the vocabulary, including optional mask and OOV indices.
- """
- return self._vocab_size
-
- def vocab_size(self):
- logging.warning("vocab_size is deprecated, please use vocabulary_size.")
- return self.vocabulary_size()
-
- def get_config(self):
- if self._has_static_table:
- vocabulary_path = self._static_vocabulary_path
- else:
- vocabulary_path = None
-
- config = {
- "invert": self.invert,
- "max_tokens": self.max_tokens,
- "num_oov_indices": self.num_oov_indices,
- "oov_token": self.oov_token,
- "mask_token": self.mask_token,
- "output_mode": self.output_mode,
- "pad_to_max_tokens": self.pad_to_max_tokens,
- "vocabulary_size": self.vocabulary_size(),
- "vocabulary": vocabulary_path,
- }
- if self._has_static_table:
- config["has_static_table"] = True
-
- base_config = super(IndexLookup, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
- def count_params(self):
- # This method counts the number of scalars in the weights of this layer.
- # Since this layer doesn't have any /actual/ weights (in that there's
- # nothing in this layer that can be trained - we only use the weight
- # abstraction for ease of saving!) we return 0.
- return 0
-
- def set_vocabulary(self, vocabulary, idf_weights=None):
- """Sets vocabulary (and optionally document frequency) data for this layer.
-
- This method sets the vocabulary and idf weights for this layer directly,
- instead of analyzing a dataset through `adapt`. It should be used whenever
- the vocab (and optionally document frequency) information is already known.
- If vocabulary data is already present in the layer, this method will replace
- it.
-
- Args:
- vocabulary: An array, numpy array, or tensor of hashable tokens.
- idf_weights: An array, numpy array, or tensor of inverse document
- frequency weights with equal length to vocab. Only necessary if the
- layer output_mode is TF_IDF.
-
- Raises:
- ValueError: If there are too many inputs, the inputs do not match, or
- input data is missing.
-      RuntimeError: If the vocabulary cannot be set when this function is
-        called. This happens in `"multi_hot"`, `"count"`, and `"tf_idf"` modes
-        if `pad_to_max_tokens` is False and the layer itself has already been
-        called.
- RuntimeError: If a tensor vocabulary is passed outside of eager execution.
- """
- if self._has_static_table:
- raise RuntimeError("Layer {} was created with a static file-based table "
- "because a file path was passed to the layer "
- "init. Layers created with static file-based tables "
- "do not support changing the vocabulary after "
- "creation.".format(self.name))
-
- if self.output_mode != TF_IDF and idf_weights is not None:
- raise ValueError("`idf_weights` should only be set if output_mode is "
- "TF_IDF. output_mode is {}.".format(self.output_mode))
-
- if (self.output_mode in [MULTI_HOT, COUNT, TF_IDF] and self._called and
- not self.pad_to_max_tokens):
- raise RuntimeError("When using {} mode and `pad_to_max_tokens` is "
- "False, the vocabulary cannot be changed after the "
- "layer is called.".format(self.output_mode))
-
- if not context.executing_eagerly() and (tensor_util.is_tensor(vocabulary) or
- tensor_util.is_tensor(idf_weights)):
- raise RuntimeError(
- "Cannot set a tensor vocabulary on {} layer {} when not executing "
- "eagerly. Create this layer or call `set_vocabulary` outside of "
- "any `tf.function`s and with eager execution enabled.".format(
- self.__class__.__name__, self.name))
-
- # TODO(mattdangerw): for better performance we should rewrite this entire
- # function to operate on tensors and convert vocabulary to a tensor here.
- if tensor_util.is_tensor(vocabulary):
- vocabulary = self._tensor_vocab_to_numpy(vocabulary)
- if tensor_util.is_tensor(idf_weights):
- idf_weights = idf_weights.numpy()
-
- oov_start = self._oov_start_index()
- token_start = self._token_start_index()
- should_have_mask = (oov_start > 0)
- has_mask = should_have_mask and vocabulary[0] == self.mask_token
-
- should_have_oov = (self.num_oov_indices > 0)
- expected_oov = [self.oov_token] * self.num_oov_indices
- found_oov = vocabulary[oov_start:token_start]
- has_oov = should_have_oov and found_oov == expected_oov
- # If we get a numpy array, then has_oov may end up being a numpy array
- # instead of a bool. Fix this by collapsing the variable if it's not bool.
- if not isinstance(has_oov, bool):
- has_oov = any(has_oov)
-
- if all([should_have_mask, has_mask, should_have_oov]) and not has_oov:
- raise ValueError(
- "Invalid vocabulary format. The layer was created with "
- "`mask_token={mask}` and `oov_token={oov}`. These tokens should be "
- "included in the provided vocabulary. The passed vocabulary has the "
- "correct mask token `{mask}` at index 0, but does not have the OOV "
- "token `{oov}` in indices [{start}:{end}]. Instead, we found "
- "`{found}`. Was this vocabulary generated by a layer with "
- "incompatible settings?".format(
- mask=self.mask_token,
- oov=self.oov_token,
- start=oov_start,
- end=token_start,
- found=found_oov))
-
- if all([should_have_oov, has_oov, should_have_mask]) and not has_mask:
- raise ValueError(
- "Invalid vocabulary format. The layer was created with "
- "`mask_token={mask}` and `oov_token={oov}`. These tokens should be "
- "included in the provided vocabulary. The passed vocabulary has the "
- "correct OOV token `{oov}` at indices [{start}:{end}], but does not "
- "have the mask token `{mask}` in index 0. Instead, we found "
- "`{found}`. Was this vocabulary generated by a layer with "
- "incompatible settings?".format(
- mask=self.mask_token,
- oov=self.oov_token,
- start=oov_start,
- end=token_start,
- found=vocabulary[0]))
-
- found_special_tokens = has_oov or has_mask
- if found_special_tokens:
- tokens = vocabulary[token_start:]
- else:
- tokens = vocabulary
-
- repeated_tokens = table_utils.find_repeated_tokens(tokens)
- if repeated_tokens:
- raise ValueError("The passed vocabulary has at least one repeated "
- "term. Please uniquify your dataset. The repeated terms "
- "are {}".format(repeated_tokens))
-
- if self.mask_token in tokens:
- raise ValueError("Reserved mask token {} was found in the passed "
- "vocabulary at index {}. Please either remove the "
- "reserved token from the vocabulary or change the "
- "mask token for this layer.".format(
- self.mask_token, tokens.index(self.mask_token)))
- if self.oov_token in tokens:
- raise ValueError("Reserved OOV token {} was found in the passed "
- "vocabulary at index {}. Please either remove the "
- "reserved token from the vocabulary or change the "
- "OOV token for this layer.".format(
- self.oov_token, tokens.index(self.oov_token)))
-
- self._vocab_size = token_start + len(tokens)
- if self.max_tokens is not None and self._vocab_size > self.max_tokens:
- raise ValueError(
- "Attempted to set a vocabulary larger than the maximum vocab size. "
- "Passed vocab size is {}, max vocab size is {}.".format(
- self._vocab_size, self.max_tokens))
-
- if self.output_mode == TF_IDF:
- if idf_weights is None:
- raise ValueError("`idf_weights` must be set if output_mode is TF_IDF")
- if len(vocabulary) != len(idf_weights):
- raise ValueError("`idf_weights` must be the same length as vocabulary. "
- "len(idf_weights) is {}, len(vocabulary) is {}".format(
- len(vocabulary), len(idf_weights)))
- idf_weights = self._convert_to_ndarray(idf_weights)
- if idf_weights.ndim != 1:
- raise ValueError(
- "TF-IDF data must be a 1-index array, but received {}".format(
- type(idf_weights)))
-
- # We add the non-special vocab tokens and optionally the mask_token to our
- # hash table. OOV tokens are handled with the hash table default value and
- # not added directly.
- self._table_handler.clear()
- indices = np.arange(token_start, len(tokens) + token_start, dtype=np.int64)
- if self.invert:
- self._table_handler.insert(indices, tokens)
- else:
- self._table_handler.insert(tokens, indices)
- if self.mask_token is not None:
- self._table_handler.insert([self._mask_key], [self._mask_value])
-
- if self.output_mode == TF_IDF:
- # If the passed vocabulary has no special tokens, we need to pad the front
- # of idf_weights. We don't have real document frequencies for these tokens
- # so we will use an average of all idf_weights passed in as a reasonable
- # default.
- if found_special_tokens:
- front_padding = 0
- front_padding_value = 0
- else:
- front_padding = token_start
- front_padding_value = np.average(idf_weights)
- # If pad_to_max_tokens is true, and max_tokens is greater than our total
- # vocab size, we need to pad the back of idf_weights with zeros as well.
- back_padding_value = 0
- if self.pad_to_max_tokens and self.max_tokens is not None:
- back_padding = self.max_tokens - front_padding - len(idf_weights)
- else:
- back_padding = 0
- idf_weights = np.pad(
- idf_weights, (front_padding, back_padding),
- "constant",
- constant_values=(front_padding_value, back_padding_value))
- backend.set_value(self.tf_idf_weights, idf_weights)
-
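Editor's note: the validation above expects special tokens, when present, to occupy a fixed prefix of the index space in "int" mode: the mask token at index 0, followed by num_oov_indices OOV buckets, then the vocabulary terms in descending frequency. A minimal sketch of the resulting layout, reusing the class defined in this (now deleted) file and assuming TF2 eager execution:

    import numpy as np
    from tensorflow.python.framework import dtypes

    layer = IndexLookup(
        max_tokens=None, num_oov_indices=1, mask_token="", oov_token="[OOV]",
        dtype=dtypes.string)
    layer.set_vocabulary(["earth", "wind", "and", "fire"])
    layer(np.array([["earth", "wind", "and", "fire", "michigan", ""]]))
    # -> [[2, 3, 4, 5, 1, 0]]: the mask maps to 0, the single OOV bucket to 1,
    #    and vocabulary terms start at index 2.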
- def _set_state_variables(self, updates):
- if not self.built:
- raise RuntimeError("_set_state_variables() must be called after build().")
- self.set_vocabulary(
- updates[_VOCAB_NAME], idf_weights=updates[_IDF_WEIGHTS_NAME])
-
- def call(self, inputs):
- if isinstance(inputs, (list, tuple, np.ndarray)):
- inputs = ops.convert_to_tensor_v2_with_dispatch(inputs)
-
- if not self.max_tokens and self._vocab_size is None:
- raise ValueError("You must set the layer's vocabulary before calling it. "
- "Either pass a `vocabulary` argument to the layer, or "
- "call `layer.adapt(dataset)` with some sample data.")
- self._called = True
- if self._key_dtype == dtypes.int64 and inputs.dtype == dtypes.int32:
- inputs = math_ops.cast(inputs, dtypes.int64)
- lookup_result = self._table_handler.lookup(inputs)
-
- lookup_checks = []
-
- if self.num_oov_indices == 0 and not self.invert:
- if tf_utils.is_sparse(inputs):
- lookup_values = lookup_result.values
- input_values = inputs.values
- elif tf_utils.is_ragged(inputs):
- lookup_values = lookup_result.flat_values
- input_values = inputs.flat_values
- else:
- lookup_values = lookup_result
- input_values = inputs
- oov_indices = array_ops.where_v2(math_ops.equal(lookup_values, -1))
- oov_inputs = array_ops.gather_nd(input_values, oov_indices)
- msg = string_ops.string_format(
- "When `num_oov_indices=0` all inputs should be in vocabulary, "
- "found OOV values {}, consider setting `num_oov_indices=1`.",
- (oov_inputs,))
- assertion = control_flow_ops.Assert(
- math_ops.equal(array_ops.size(oov_indices), 0), [msg])
- lookup_checks.append(assertion)
-
- with ops.control_dependencies(lookup_checks):
- if self.output_mode == INT:
- return array_ops.identity(lookup_result)
- else:
- return self._encode_output(lookup_result)
-
- def _encode_output(self, lookup_result):
- def expand_dims(inputs, axis):
- if tf_utils.is_sparse(inputs):
- return sparse_ops.sparse_expand_dims(inputs, axis)
- else:
- return array_ops.expand_dims(inputs, axis)
-
- original_shape = lookup_result.shape
- # In all cases, we should uprank scalar input to a single sample.
- if lookup_result.shape.rank == 0:
- lookup_result = expand_dims(lookup_result, -1)
-    # One-hot will only uprank if the final output dimension is not already 1.
- if self.output_mode == ONE_HOT:
- if lookup_result.shape[-1] != 1:
- lookup_result = expand_dims(lookup_result, -1)
-
- # TODO(b/190445202): remove output rank restriction.
- if lookup_result.shape.rank > 2:
- raise ValueError(
- "Received input shape {}, which would result in output rank {}. "
- "Currently only outputs up to rank 2 are supported for "
- "`output_mode={}`.".format(original_shape, lookup_result.shape.rank,
- self.output_mode))
-
- binary_output = self.output_mode in (MULTI_HOT, ONE_HOT)
- if self._vocab_size and not self.pad_to_max_tokens:
- out_depth = self._vocab_size
- else:
- out_depth = self.max_tokens
- if self.sparse:
- bincounts = category_encoding.sparse_bincount(lookup_result, out_depth,
- binary_output)
- else:
- bincounts = category_encoding.dense_bincount(lookup_result, out_depth,
- binary_output)
-
- if self.output_mode == TF_IDF:
- return math_ops.multiply(bincounts, self.tf_idf_weights)
-
- return bincounts
-
- def _convert_to_ndarray(self, x):
- return np.array(x) if isinstance(x, (list, tuple)) else x
-
- def _oov_start_index(self):
- return 1 if self.mask_token is not None and self.output_mode == INT else 0
-
- def _token_start_index(self):
- return self._oov_start_index() + self.num_oov_indices
-
- @property
- def _trackable_saved_model_saver(self):
- return layer_serialization.IndexLookupLayerSavedModelSaver(self)
-
- # Override points for IntegerLookup and StringLookup.
- def _tensor_vocab_to_numpy(self, vocabulary):
- """Converts a tensor vocabulary to a numpy vocabulary."""
- return vocabulary.numpy()
-
-
-class _IndexLookupAccumulator(
- collections.namedtuple("Accumulator",
- ["data", "count_dict", "per_doc_count_dict"])):
- pass
-
-
-class _IndexLookupCombiner(base_preprocessing_layer.Combiner):
- """Combiner for the IndexLookup preprocessing layer.
-
- This class encapsulates the logic for computing a vocabulary based on the
- frequency of each token.
-
- Attributes:
- vocab_size: (Optional) If set, only the top `vocab_size` tokens (based on
- frequency across the dataset) are retained in the vocabulary. If None, or
- set to a value greater than the total number of distinct tokens in the
- dataset, all tokens are retained.
- """
-
- def __init__(self,
- vocab_size=None,
- mask_value=None,
- oov_value=None,
- compute_idf=False):
- self._vocab_size = vocab_size
- self._mask_value = mask_value
- self._oov_value = oov_value
- self._compute_idf = compute_idf
-
- def compute(self, values, accumulator=None):
- """Compute a step in this computation, returning a new accumulator."""
- values = base_preprocessing_layer.convert_to_list(
- values, sparse_default_value=self._mask_value)
-
- if accumulator is None:
- accumulator = self._create_accumulator()
-
- # TODO(momernick): Benchmark improvements to this algorithm.
- if not isinstance(values, list):
- values = [values]
- for document in values:
- if not isinstance(document, list):
- document = [document]
- if self._compute_idf:
- current_doc_id = accumulator.data["next_doc_id"]
- accumulator.data["next_doc_id"] += 1
- for token in document:
- accumulator.count_dict[token] += 1
- if self._compute_idf:
- doc_count = accumulator.per_doc_count_dict[token]
- if doc_count["last_doc_id"] != current_doc_id:
- doc_count["count"] += 1
- doc_count["last_doc_id"] = current_doc_id
-
- return accumulator
-
- def merge(self, accumulators):
- """Merge several accumulators to a single accumulator."""
- if not accumulators:
- return accumulators
-
- base_accumulator = accumulators[0]
- for accumulator in accumulators[1:]:
- for token, value in accumulator.count_dict.items():
- base_accumulator.count_dict[token] += value
-
- if self._compute_idf:
- base_accumulator.data["next_doc_id"] += accumulator.data["next_doc_id"]
- if self._compute_idf:
- for token, value in accumulator.per_doc_count_dict.items():
-          # Any newly created token counts in `base_accumulator`'s
-          # per_doc_count_dict will have a last_doc_id of -1. This is always
-          # less than the next doc id (which is non-negative), so any
-          # future occurrences are guaranteed to be counted.
- base_accumulator.per_doc_count_dict[token]["count"] += value[
- "count"]
-
- return base_accumulator
-
- def extract(self, accumulator):
- """Convert an accumulator into a dict of output values.
-
- Args:
- accumulator: An accumulator aggregating over the full dataset.
-
- Returns:
- A dict of:
- "vocab": A list of the retained items in the vocabulary.
- """
- vocab_counts = accumulator.count_dict
-
- # Drop special tokens from our vocab.
- if self._mask_value in vocab_counts:
- del vocab_counts[self._mask_value]
- if self._oov_value in vocab_counts:
- del vocab_counts[self._oov_value]
- # Data processed by the accumulator could be tensors, numpy arrays or lists.
- # For tensor string input, values will have been converted into bytes. We
- # need to check the bytes version of special tokens in this case.
- if isinstance(self._mask_value, str):
- mask_value_bytes = compat.as_bytes(self._mask_value)
- if mask_value_bytes in vocab_counts:
- del vocab_counts[mask_value_bytes]
- if isinstance(self._oov_value, str):
- oov_value_bytes = compat.as_bytes(self._oov_value)
- if oov_value_bytes in vocab_counts:
- del vocab_counts[oov_value_bytes]
-
- sorted_counts = sorted(
- vocab_counts.items(), key=operator.itemgetter(1, 0), reverse=True)
- vocab_data = (
- sorted_counts[:self._vocab_size] if self._vocab_size else sorted_counts)
- vocab = [data[0] for data in vocab_data]
-
- if self._compute_idf:
- num_documents = accumulator.data["next_doc_id"]
- document_counts = accumulator.per_doc_count_dict
- doc_counts = [document_counts[token]["count"] for token in vocab]
- idf_weights = self._inverse_document_frequency(doc_counts, num_documents)
- else:
- idf_weights = None
-
- return {_VOCAB_NAME: vocab, _IDF_WEIGHTS_NAME: idf_weights}
-
- def restore(self, output):
- """Create an accumulator based on 'output'."""
- raise NotImplementedError(
- "IndexLookup does not restore or support streaming updates.")
-
- def serialize(self, accumulator):
- """Serialize an accumulator for a remote call."""
- output_dict = {}
- output_dict["vocab"] = list(accumulator.count_dict.keys())
- output_dict["vocab_counts"] = list(accumulator.count_dict.values())
-
- if self._compute_idf:
- output_dict["data"] = accumulator.data
- output_dict["idf_vocab"] = list(accumulator.per_doc_count_dict.keys())
- output_dict["idf_counts"] = [
- counter["count"]
- for counter in accumulator.per_doc_count_dict.values()
- ]
- return compat.as_bytes(json.dumps(output_dict))
-
- def deserialize(self, encoded_accumulator):
- """Deserialize an accumulator received from 'serialize()'."""
- accumulator_dict = json.loads(compat.as_text(encoded_accumulator))
-
- accumulator = self._create_accumulator()
- count_dict = dict(
- zip(accumulator_dict["vocab"], accumulator_dict["vocab_counts"]))
- accumulator.count_dict.update(count_dict)
-
- if self._compute_idf:
- accumulator.data = accumulator_dict["data"]
- create_dict = lambda x: {"count": x, "last_doc_id": -1}
- idf_count_dicts = [
- create_dict(count) for count in accumulator_dict["idf_counts"]
- ]
- idf_dict = dict(zip(accumulator_dict["idf_vocab"], idf_count_dicts))
- accumulator.per_doc_count_dict.update(idf_dict)
- return accumulator
-
- def _create_accumulator(self):
- """Accumulate a sorted array of vocab tokens and corresponding counts."""
-
- if self._compute_idf:
- create_default_dict = lambda: {"count": 0, "last_doc_id": -1}
- per_doc_count_dict = collections.defaultdict(create_default_dict)
- data = {"next_doc_id": 0}
- else:
- per_doc_count_dict = None
- data = None
-
- count_dict = collections.defaultdict(int)
- return _IndexLookupAccumulator(data, count_dict, per_doc_count_dict)
-
- def _inverse_document_frequency(self, document_counts, num_documents):
- """Computes the inverse-document-frequency (IDF) component of TF-IDF.
-
- Uses the default weighting scheme described in
- https://en.wikipedia.org/wiki/Tf%E2%80%93idf.
-
- Args:
- document_counts: An array of the # of documents each token appears in.
-      num_documents: An int representing the total number of documents.
-
- Returns:
- An array of "inverse document frequency" weights.
- """
- return np.log(1 + num_documents / (1 + np.array(document_counts)))
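Editor's note: a small NumPy sketch of the weighting computed by `_inverse_document_frequency` above and consumed by the `"tf_idf"` branch of `_encode_output`; the corpus statistics below are made up for illustration:

    import numpy as np

    def inverse_document_frequency(document_counts, num_documents):
      # Same scheme as the deleted method: idf(t) = log(1 + N / (1 + df(t))).
      counts = np.array(document_counts, dtype=np.float64)
      return np.log(1 + num_documents / (1 + counts))

    # Hypothetical statistics: 4 documents, per-token document frequencies for
    # the terms "earth", "wind", "and", "fire".
    idf = inverse_document_frequency([4, 3, 2, 1], num_documents=4)

    # In "tf_idf" mode the layer multiplies the per-sample bincount of the
    # lookup result by these weights.
    sample_counts = np.array([2.0, 0.0, 1.0, 0.0])  # "earth" twice, "and" once
    tf_idf_vector = sample_counts * idf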
diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py
deleted file mode 100644
index fdbc232..0000000
--- a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Distribution tests for keras.layers.preprocessing.index_lookup."""
-
-import os
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import combinations as ds_combinations
-from tensorflow.python.distribute import multi_process_runner
-from tensorflow.python.framework import config
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_combinations as combinations
-from tensorflow.python.keras import backend
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.distribute import strategy_combinations
-from tensorflow.python.keras.layers.preprocessing import index_lookup
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.platform import gfile
-
-
-@ds_combinations.generate(
- combinations.combine(
- strategy=strategy_combinations.all_strategies +
- strategy_combinations.multi_worker_mirrored_strategies,
- mode=["eager"])) # Eager-only, no graph: b/158793009
-class IndexLookupDistributionTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def _write_to_temp_file(self, file_name, vocab_list):
- vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
- with gfile.GFile(vocab_path, "w") as writer:
- for vocab in vocab_list:
- writer.write(vocab + "\n")
- writer.flush()
- writer.close()
- return vocab_path
-
- def test_strategy(self, strategy):
- # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
- if backend.is_tpu_strategy(strategy):
- self.skipTest("This test needs MLIR bridge on TPU.")
-
- vocab_data = [[
- "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
- "and", "fire"
- ]]
- vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- input_dataset = dataset_ops.Dataset.from_tensor_slices(input_array).batch(
- 2, drop_remainder=True)
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- config.set_soft_device_placement(True)
-
- with strategy.scope():
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.adapt(vocab_dataset)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- model.compile(loss="mse")
- output_dataset = model.predict(input_dataset)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_strategy_with_file(self, strategy):
- # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
- if backend.is_tpu_strategy(strategy):
- self.skipTest("This test needs MLIR bridge on TPU.")
-
- vocab_data = ["earth", "wind", "and", "fire"]
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- input_dataset = dataset_ops.Dataset.from_tensor_slices(input_array).batch(
- 2, drop_remainder=True)
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- config.set_soft_device_placement(True)
-
- with strategy.scope():
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- vocabulary=vocab_file)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- model.compile(loss="mse")
- output_dataset = model.predict(input_dataset)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_tpu_with_multiple_oov(self, strategy):
- # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
- if backend.is_tpu_strategy(strategy):
- self.skipTest("This test needs MLIR bridge on TPU.")
-
- vocab_data = [[
- "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
- "and", "fire"
- ]]
- vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- input_dataset = dataset_ops.Dataset.from_tensor_slices(input_array).batch(
- 2, drop_remainder=True)
- expected_output = [[3, 4, 5, 6], [6, 5, 3, 1]]
-
- config.set_soft_device_placement(True)
-
- with strategy.scope():
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=2,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.adapt(vocab_dataset)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- model.compile(loss="mse")
- output_dataset = model.predict(input_dataset)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-if __name__ == "__main__":
- v2_compat.enable_v2_behavior()
- multi_process_runner.test_main()
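Editor's note: the deleted distribution tests reduce to a simple adapt-then-lookup contract: with an empty-string mask and one OOV bucket, indices are ordered [mask, OOV, terms by descending frequency], so "earth"/"wind"/"and"/"fire" map to 2..5 and an unknown token to 1. A minimal single-device sketch of that expectation against the maintained public layer (assuming tf.keras.layers.StringLookup, the surviving counterpart of the removed IndexLookup internals):

    import numpy as np
    import tensorflow as tf

    vocab_data = [["earth", "earth", "earth", "earth", "wind", "wind", "wind",
                   "and", "and", "fire"]]
    inputs = np.array([["earth", "wind", "and", "fire"],
                       ["fire", "and", "earth", "michigan"]])

    layer = tf.keras.layers.StringLookup(mask_token="")  # index 0 = mask, 1 = OOV
    layer.adapt(vocab_data)
    print(layer(inputs))  # [[2 3 4 5], [5 4 2 1]], as in the removed test above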
diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py
deleted file mode 100644
index f27d04b..0000000
--- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py
+++ /dev/null
@@ -1,2588 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Keras text vectorization preprocessing layer."""
-
-import itertools
-import os
-import random
-import string
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python import tf2
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import def_function
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.layers.preprocessing import index_lookup
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import load
-from tensorflow.python.saved_model import save
-
-
-def zip_and_sort(weight_values):
- keys, values = weight_values
- return sorted(zip(keys, values), key=lambda x: x[1])
-
-
-def _get_end_to_end_test_cases():
- test_cases = (
- {
- "testcase_name":
- "test_strings_soft_vocab_cap",
- # Create an array where 'earth' is the most frequent term, followed by
- # 'wind', then 'and', then 'fire'. This ensures that the vocab
- # accumulator is sorting by frequency.
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
- ["and"], ["earth"], ["michigan"]]),
- "kwargs": {
- "max_tokens": None,
- "num_oov_indices": 1,
- "mask_token": "",
- "oov_token": "[OOV]",
- "dtype": dtypes.string,
- },
- "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
- "input_dtype":
- dtypes.string
- },
- {
- "testcase_name":
- "test_inverse_strings_soft_vocab_cap",
- # Create an array where 'earth' is the most frequent term, followed by
- # 'wind', then 'and', then 'fire'. This ensures that the vocab
- # accumulator is sorting by frequency.
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([[2], [3], [4], [1], [1], [4], [2], [5]]),
- "kwargs": {
- "max_tokens": None,
- "num_oov_indices": 1,
- "mask_token": "",
- "oov_token": "[OOV]",
- "dtype": dtypes.string,
- "invert": True
- },
- "expected_output":
- np.array([[b"earth"], [b"wind"], [b"and"], [b"[OOV]"], [b"[OOV]"],
- [b"and"], [b"earth"], [b"fire"]]),
- "input_dtype":
- dtypes.int64
- },
- {
- "testcase_name":
- "test_strings_with_special_tokens",
- # Mask and oov values in the vocab data should be dropped, and mapped
- # to 0 and 1 respectively when calling the layer.
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- [""], [""], [""], ["[OOV]"], ["[OOV]"], ["[OOV]"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], [""], ["wind"], ["[OOV]"], ["and"], [""],
- ["fire"], ["and"], ["[OOV]"], ["michigan"]]),
- "kwargs": {
- "max_tokens": None,
- "num_oov_indices": 1,
- "mask_token": "",
- "oov_token": "[OOV]",
- "dtype": dtypes.string,
- },
- "expected_output": [[2], [0], [3], [1], [4], [0], [5], [4], [1], [1]],
- "input_dtype":
- dtypes.string
- },
- {
- "testcase_name":
- "test_ints_soft_vocab_cap",
- # Create an array where 1138 is the most frequent term, followed by
- # 1729, then 725, then 42. This ensures that the vocab accumulator
- # is sorting by frequency.
- "vocab_data":
- np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729],
- [1729], [725], [725]],
- dtype=np.int64),
- "input_data":
- np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]],
- dtype=np.int64),
- "kwargs": {
- "max_tokens": None,
- "num_oov_indices": 1,
- "mask_token": 0,
- "oov_token": -1,
- "dtype": dtypes.int64,
- },
- "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
- "input_dtype":
- dtypes.int64
- },
- {
- "testcase_name":
- "test_ints_with_special_tokens",
- # Mask and oov values in the vocab data should be dropped, and mapped
- # to 0 and 1 respectively when calling the layer.
- "vocab_data":
- np.array([[42], [1138], [1138], [1138], [1138], [0], [0], [0],
- [-1], [-1], [-1], [1729], [1729], [1729], [725], [725]],
- dtype=np.int64),
- "input_data":
- np.array([[1138], [0], [1729], [-1], [725], [0], [42], [725],
- [-1], [4]],
- dtype=np.int64),
- "kwargs": {
- "max_tokens": None,
- "num_oov_indices": 1,
- "mask_token": 0,
- "oov_token": -1,
- "dtype": dtypes.int64,
- },
- "expected_output": [[2], [0], [3], [1], [4], [0], [5], [4], [1], [1]],
- "input_dtype":
- dtypes.int64
- },
- {
- "testcase_name":
- "test_strings_hard_vocab_cap",
- # Create an array where 'earth' is the most frequent term, followed by
- # 'wind', then 'and', then 'fire'. This ensures that the vocab
- # accumulator is sorting by frequency.
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
- ["and"], ["earth"], ["michigan"]]),
- "kwargs": {
- "max_tokens": 5,
- "num_oov_indices": 1,
- "mask_token": "",
- "oov_token": "[OOV]",
- "dtype": dtypes.string,
- },
- "expected_output": [[2], [3], [4], [1], [1], [4], [2], [1]],
- "input_dtype":
- dtypes.string
- },
- {
- "testcase_name":
- "test_inverse_strings_hard_vocab_cap",
- # Create an array where 'earth' is the most frequent term, followed by
- # 'wind', then 'and', then 'fire'. This ensures that the vocab
- # accumulator is sorting by frequency.
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([[2], [3], [4], [1], [1], [4], [2], [5]]),
- "kwargs": {
- "max_tokens": 5,
- "num_oov_indices": 1,
- "mask_token": "",
- "oov_token": "[OOV]",
- "dtype": dtypes.string,
- "invert": True
- },
- "expected_output":
- np.array([[b"earth"], [b"wind"], [b"and"], [b"[OOV]"], [b"[OOV]"],
- [b"and"], [b"earth"], [b"[OOV]"]]),
- "input_dtype":
- dtypes.int64
- },
- {
- "testcase_name":
- "test_ints_hard_vocab_cap",
- # Create an array where 1138 is the most frequent term, followed by
- # 1729, then 725, then 42. This ensures that the vocab accumulator
- # is sorting by frequency.
- "vocab_data":
- np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729],
- [1729], [725], [725]],
- dtype=np.int64),
- "input_data":
- np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]],
- dtype=np.int64),
- "kwargs": {
- "max_tokens": 5,
- "num_oov_indices": 1,
- "mask_token": 0,
- "oov_token": -1,
- "dtype": dtypes.int64,
- },
- "expected_output": [[2], [3], [4], [1], [1], [4], [2], [1]],
- "input_dtype":
- dtypes.int64
- },
- {
- "testcase_name":
- "test_ints_tf_idf_output",
- "vocab_data":
- np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729],
- [1729], [725], [725]]),
- "input_data":
- np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]]),
- "kwargs": {
- "max_tokens": 5,
- "num_oov_indices": 1,
- "mask_token": 0,
- "oov_token": -1,
- "output_mode": index_lookup.TF_IDF,
- "dtype": dtypes.int64,
- },
- "expected_output": [[0, 1.098612, 0, 0, 0], [0, 0, 1.252763, 0, 0],
- [0, 0, 0, 1.466337, 0], [0, 0, 0, 0, 1.7917595],
- [0, 0, 0, 0, 1.7917595], [0, 0, 0, 1.4663371, 0],
- [0, 1.098612, 0, 0, 0], [1.402368, 0, 0, 0, 0]],
- "input_dtype":
- dtypes.int64
- },
- {
- "testcase_name":
- "test_strings_tf_idf_output",
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
- ["and"], ["earth"], ["michigan"]]),
- "kwargs": {
- "max_tokens": 5,
- "num_oov_indices": 1,
- "mask_token": "",
- "oov_token": "[OOV]",
- "output_mode": index_lookup.TF_IDF,
- "dtype": dtypes.string,
- },
- "expected_output": [[0, 1.098612, 0, 0, 0], [0, 0, 1.252763, 0, 0],
- [0, 0, 0, 1.466337, 0], [0, 0, 0, 0, 1.7917595],
- [0, 0, 0, 0, 1.7917595], [0, 0, 0, 1.4663371, 0],
- [0, 1.098612, 0, 0, 0], [1.402368, 0, 0, 0, 0]],
- "input_dtype":
- dtypes.string
- },
- )
-
- crossed_test_cases = []
- # Cross above test cases with use_dataset in (True, False)
- for use_dataset in (True, False):
- for case in test_cases:
- case = case.copy()
- if use_dataset:
- case["testcase_name"] = case["testcase_name"] + "_with_dataset"
- case["use_dataset"] = use_dataset
- crossed_test_cases.append(case)
-
- return crossed_test_cases
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IndexLookupLayerTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- @parameterized.named_parameters(*_get_end_to_end_test_cases())
- def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
- use_dataset, expected_output,
- input_dtype):
- cls = index_lookup.IndexLookup
- if "invert" in kwargs and kwargs["invert"]:
- expected_output_dtype = kwargs["dtype"]
- elif "output_mode" in kwargs and kwargs["output_mode"] != index_lookup.INT:
- expected_output_dtype = dtypes.float32
- else:
- expected_output_dtype = dtypes.int64
-
- input_shape = input_data.shape
-
- if use_dataset:
- # Keras APIs expect batched datasets.
- # TODO(rachelim): `model.predict` predicts the result on each
- # dataset batch separately, then tries to concatenate the results
- # together. When the results have different shapes on the non-concat
- # axis (which can happen in the output_mode = INT case for
- # IndexLookup), the concatenation fails. In real use cases, this may
- # not be an issue because users are likely to pipe the preprocessing layer
- # into other keras layers instead of predicting it directly. A workaround
- # for these unit tests is to have the dataset only contain one batch, so
- # no concatenation needs to happen with the result. For consistency with
- # numpy input, we should make `predict` join differently shaped results
- # together sensibly, with 0 padding.
- input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
- input_shape[0])
- vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
- input_shape[0])
-
- with CustomObjectScope({"IndexLookup": cls}):
- output_data = testing_utils.layer_test(
- cls,
- kwargs=kwargs,
- input_shape=input_shape,
- input_data=input_data,
- input_dtype=input_dtype,
- expected_output_dtype=expected_output_dtype,
- validate_training=False,
- adapt_data=vocab_data)
- if "invert" in kwargs and kwargs["invert"]:
- self.assertAllEqual(expected_output, output_data)
- else:
- self.assertAllClose(expected_output, output_data)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoricalEncodingInputTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_sparse_string_input(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=["fire", "michigan"],
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [5, 1]
- expected_dense_shape = [3, 4]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_sparse_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=np.array([13, 32], dtype=np.int64),
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [5, 1]
- expected_dense_shape = [3, 4]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- dtype=dtypes.int64,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_ragged_string_input(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = ragged_factory_ops.constant(
- [["earth", "wind", "fire"], ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_ragged_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
- dtype=np.int64)
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- dtype=dtypes.int64,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int32_input_with_int64_keys(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
- dtype=np.int32)
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- dtype=dtypes.int64,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoricalEncodingMultiOOVTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_sparse_string_input_multi_bucket(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]], values=["fire", "ohio"], dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [6, 2]
- expected_dense_shape = [3, 4]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=2,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_sparse_int_input_multi_bucket(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=np.array([13, 133], dtype=np.int64),
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [6, 2]
- expected_dense_shape = [3, 4]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- dtype=dtypes.int64,
- num_oov_indices=2,
- mask_token=0,
- oov_token=-1)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_ragged_string_input_multi_bucket(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = ragged_factory_ops.constant([["earth", "wind", "fire"],
- ["fire", "and", "earth",
- "ohio"]])
- expected_output = [[3, 4, 6], [6, 5, 3, 2]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=2,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_ragged_int_input_multi_bucket(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 133]],
- dtype=np.int64)
- expected_output = [[3, 4, 6], [6, 5, 3, 2]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- dtype=dtypes.int64,
- num_oov_indices=2,
- mask_token=0,
- oov_token=-1)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoricalEncodingAdaptTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_sparse_adapt(self):
- vocab_data = sparse_tensor.SparseTensor(
- indices=[[0, 0], [0, 1], [1, 2]],
- values=["michigan", "fire", "michigan"],
- dense_shape=[3, 4])
- vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
-
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.adapt(vocab_dataset)
- expected_vocabulary = ["", "[OOV]", "michigan", "fire"]
- self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
-
- def test_ragged_adapt(self):
- vocab_data = ragged_factory_ops.constant([["michigan"],
- ["fire", "michigan"]])
- vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
-
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.adapt(vocab_dataset)
- expected_vocabulary = ["", "[OOV]", "michigan", "fire"]
- self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
-
- def test_sparse_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=np.array([13, 32], dtype=np.int64),
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [5, 1]
- expected_dense_shape = [3, 4]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- dtype=dtypes.int64,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_ragged_string_input(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = ragged_factory_ops.constant(
- [["earth", "wind", "fire"], ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_ragged_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
- dtype=np.int64)
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- dtype=dtypes.int64,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_single_string_generator_dataset(self):
-
- def word_gen():
- for _ in itertools.count(1):
- yield "".join(random.choice(string.ascii_letters) for i in range(2))
-
- ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.string,
- tensor_shape.TensorShape([]))
- batched_ds = ds.take(2)
- input_t = keras.Input(shape=(), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=10,
- num_oov_indices=0,
- mask_token=None,
- oov_token=None,
- dtype=dtypes.string)
- _ = layer(input_t)
- layer.adapt(batched_ds)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IndexLookupOutputTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def _write_to_temp_file(self, file_name, vocab_list):
- vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
- with gfile.GFile(vocab_path, "w") as writer:
- for vocab in vocab_list:
- writer.write(vocab + "\n")
- writer.flush()
- writer.close()
- return vocab_path
-
- def test_int_output(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_shape(self):
- input_data = keras.Input(batch_size=16, shape=(4,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=2,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- int_data = layer(input_data)
- self.assertAllEqual(int_data.shape.as_list(), [16, 4])
-
- def test_int_output_no_reserved_zero(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token=None,
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_no_oov(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- valid_input = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", ""]])
- invalid_input = np.array([["earth", "wind", "and", "michigan"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=0,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(valid_input)
- self.assertAllEqual(expected_output, output_data)
- with self.assertRaisesRegex(errors.InvalidArgumentError,
- "found OOV values.*michigan"):
- _ = model.predict(invalid_input)
-
- def test_int_output_no_oov_ragged(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- valid_input = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", ""]])
- invalid_input = np.array([["earth", "wind", "and", "michigan"],
- ["fire", "and", "earth", "michigan"]])
- valid_input = ragged_tensor.RaggedTensor.from_tensor(valid_input)
- invalid_input = ragged_tensor.RaggedTensor.from_tensor(invalid_input)
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=0,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(valid_input)
- self.assertAllEqual(expected_output, output_data)
- with self.assertRaisesRegex(errors.InvalidArgumentError,
- "found OOV values.*michigan"):
- _ = model.predict(invalid_input)
-
- def test_int_output_no_oov_sparse(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- valid_input = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", ""]])
- invalid_input = np.array([["earth", "wind", "and", "michigan"],
- ["fire", "and", "earth", "michigan"]])
- valid_input = sparse_ops.from_dense(valid_input)
- invalid_input = sparse_ops.from_dense(invalid_input)
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=0,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(valid_input)
- self.assertAllEqual(expected_output,
- sparse_ops.sparse_tensor_to_dense(output_data))
- with self.assertRaisesRegex(errors.InvalidArgumentError,
- "found OOV values.*michigan"):
- _ = model.predict(invalid_input)
-
- def test_int_output_explicit_vocab(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_one_hot_output_hard_maximum(self):
- """Check binary output when pad_to_max_tokens=True."""
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array(["earth", "wind", "and", "fire", "michigan", ""])
- expected_output = [
- [0, 1, 0, 0, 0, 0],
- [0, 0, 1, 0, 0, 0],
- [0, 0, 0, 1, 0, 0],
- [0, 0, 0, 0, 1, 0],
- [1, 0, 0, 0, 0, 0],
- [0, 0, 0, 0, 0, 0],
- ]
-
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=6,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.ONE_HOT,
- pad_to_max_tokens=True,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- binary_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=binary_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_one_hot_output_soft_maximum(self):
- """Check binary output when pad_to_max_tokens=False."""
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array(["earth", "wind", "and", "fire", "michigan", ""])
- expected_output = [
- [0, 1, 0, 0, 0],
- [0, 0, 1, 0, 0],
- [0, 0, 0, 1, 0],
- [0, 0, 0, 0, 1],
- [1, 0, 0, 0, 0],
- [0, 0, 0, 0, 0],
- ]
-
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.ONE_HOT,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- binary_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=binary_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_one_hot_output_shape(self):
- inputs = keras.Input(batch_size=16, shape=(1,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- vocabulary=["earth"],
- max_tokens=2,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.ONE_HOT,
- dtype=dtypes.string)
- outputs = layer(inputs)
- self.assertAllEqual(outputs.shape.as_list(), [16, 2])
-
- def test_multi_hot_output_hard_maximum(self):
- """Check binary output when pad_to_max_tokens=True."""
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire", ""],
- ["fire", "fire", "and", "earth", "michigan"]])
- expected_output = [
- [0, 1, 1, 1, 1, 0],
- [1, 1, 0, 1, 1, 0],
- ]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=6,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.MULTI_HOT,
- pad_to_max_tokens=True,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- binary_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=binary_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_multi_hot_output_no_oov(self):
- """Check binary output when pad_to_max_tokens=True."""
- vocab_data = ["earth", "wind", "and", "fire"]
- valid_input = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", ""]])
- invalid_input = np.array([["earth", "wind", "and", "michigan"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [
- [1, 1, 1, 1, 0],
- [1, 0, 1, 1, 0],
- ]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=5,
- num_oov_indices=0,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.MULTI_HOT,
- pad_to_max_tokens=True,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- binary_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=binary_data)
- output_data = model.predict(valid_input)
- self.assertAllEqual(expected_output, output_data)
- with self.assertRaisesRegex(errors.InvalidArgumentError,
- "found OOV values.*michigan"):
- _ = model.predict(invalid_input)
-
- def test_multi_hot_output_hard_maximum_multiple_adapts(self):
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
- adapt_data = ["earth", "earth", "earth", "earth", "wind", "wind", "wind"]
- first_expected_output = [
- [1, 1, 1, 0, 0],
- [1, 1, 0, 0, 0],
- ]
- second_adapt_data = [
- "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
- "and", "fire"
- ]
- second_expected_output = [
- [0, 1, 1, 1, 0],
- [1, 1, 0, 1, 0],
- ]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=5,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.MULTI_HOT,
- pad_to_max_tokens=True,
- dtype=dtypes.string)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- # Test the first adapt
- layer.adapt(adapt_data)
- first_output = model.predict(input_array)
- # Test the second adapt
- layer.adapt(second_adapt_data)
- second_output = model.predict(input_array)
- self.assertAllEqual(first_expected_output, first_output)
- self.assertAllEqual(second_expected_output, second_output)
-
- def test_multi_hot_output_soft_maximum(self):
- """Check multi_hot output when pad_to_max_tokens=False."""
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire", ""],
- ["fire", "and", "earth", "michigan", ""]])
- expected_output = [
- [0, 1, 1, 1, 1],
- [1, 1, 0, 1, 1],
- ]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.MULTI_HOT,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- binary_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=binary_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_multi_hot_output_shape(self):
- input_data = keras.Input(batch_size=16, shape=(4,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=2,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.MULTI_HOT,
- dtype=dtypes.string)
- binary_data = layer(input_data)
- self.assertAllEqual(binary_data.shape.as_list(), [16, 2])
-
- def test_count_output_hard_maximum(self):
- """Check count output when pad_to_max_tokens=True."""
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "wind", ""],
- ["fire", "fire", "fire", "michigan", ""]])
- expected_output = [
- [0, 1, 2, 1, 0, 0],
- [1, 0, 0, 0, 3, 0],
- ]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=6,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.COUNT,
- pad_to_max_tokens=True,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- count_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=count_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_count_output_soft_maximum(self):
- """Check count output when pad_to_max_tokens=False."""
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "wind", ""],
- ["fire", "fire", "fire", "michigan", ""]])
- expected_output = [
- [0, 1, 2, 1, 0],
- [1, 0, 0, 0, 3],
- ]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.COUNT,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- count_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=count_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_count_output_shape(self):
- input_data = keras.Input(batch_size=16, shape=(4,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=2,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.COUNT,
- dtype=dtypes.string)
- count_data = layer(input_data)
- self.assertAllEqual(count_data.shape.as_list(), [16, 2])
-
- def test_tfidf_output_hard_maximum(self):
- """Check tf-idf output when pad_to_max_tokens=True."""
- vocab_data = ["earth", "wind", "and", "fire"]
- # The OOV idf weight (bucket 0) should be 0.5, the average of the passed weights.
- idf_weights = [.4, .25, .75, .6]
- input_array = np.array([["earth", "wind", "and", "earth", ""],
- ["ohio", "fire", "earth", "michigan", ""]])
- expected_output = [
- [0.00, 0.80, 0.25, 0.75, 0.00, 0.00],
- [1.00, 0.40, 0.00, 0.00, 0.60, 0.00],
- ]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=6,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.TF_IDF,
- pad_to_max_tokens=True,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data, idf_weights=idf_weights)
- layer_output = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=layer_output)
- output_dataset = model.predict(input_array)
- self.assertAllClose(expected_output, output_dataset)
-
- def test_tfidf_output_soft_maximum(self):
- """Check tf-idf output when pad_to_max_tokens=False."""
- vocab_data = ["earth", "wind", "and", "fire"]
- # The OOV idf weight (bucket 0) should be 0.5, the average of the passed weights.
- idf_weights = [.4, .25, .75, .6]
- input_array = np.array([["earth", "wind", "and", "earth", ""],
- ["ohio", "fire", "earth", "michigan", ""]])
- expected_output = [
- [0.00, 0.80, 0.25, 0.75, 0.00],
- [1.00, 0.40, 0.00, 0.00, 0.60],
- ]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.TF_IDF,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data, idf_weights=idf_weights)
- layer_output = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=layer_output)
- output_dataset = model.predict(input_array)
- self.assertAllClose(expected_output, output_dataset)
-
- def test_tfidf_output_shape(self):
- input_data = keras.Input(batch_size=16, shape=(4,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=2,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.COUNT,
- dtype=dtypes.string)
- layer_output = layer(input_data)
- self.assertAllEqual(layer_output.shape.as_list(), [16, 2])
-
- def test_int_output_file_vocab(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 0, 2, 1]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_non_int_output_file_vocab_in_tf_function(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = constant_op.constant(
- [["earth", "wind", "and", "fire", ""],
- ["fire", "and", "earth", "michigan", ""]],
- dtype=dtypes.string)
-
- expected_output = [
- [0, 1, 1, 1, 1],
- [1, 1, 0, 1, 1],
- ]
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- @def_function.function
- def compute(data):
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.MULTI_HOT,
- dtype=dtypes.string)
- return layer(data)
-
- output_dataset = compute(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_file_vocab_and_list_vocab_identical_attrs(self):
- vocab_data = ["earth", "wind", "and", "fire"]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- file_layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
-
- list_layer = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
-
- expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"]
- self.assertAllEqual(expected_vocab, list_layer.get_vocabulary())
- expected_vocab_size = 6
- self.assertAllEqual(expected_vocab_size, list_layer.vocab_size())
- self.assertAllEqual(list_layer.get_vocabulary(),
- file_layer.get_vocabulary())
- self.assertAllEqual(list_layer.vocab_size(), file_layer.vocab_size())
-
- # We expect the weights to be DIFFERENT in these cases.
- expected_weights = (["", "earth", "wind", "and", "fire"], [0, 2, 3, 4, 5])
- sorted_weights = zip_and_sort(expected_weights)
- self.assertAllEqual(sorted_weights, zip_and_sort(list_layer.get_weights()))
- self.assertAllEqual(0, len(file_layer.get_weights()))
-
- def test_file_vocab_and_list_vocab_identical_attrs_multi_oov(self):
- vocab_data = ["earth", "wind", "and", "fire"]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- file_layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- num_oov_indices=2,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
-
- list_layer = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=2,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
-
- expected_vocab = ["", "[OOV]", "[OOV]", "earth", "wind", "and", "fire"]
- self.assertAllEqual(expected_vocab, list_layer.get_vocabulary())
- expected_vocab_size = 7
- self.assertAllEqual(expected_vocab_size, list_layer.vocab_size())
- self.assertAllEqual(list_layer.get_vocabulary(),
- file_layer.get_vocabulary())
- self.assertAllEqual(list_layer.vocab_size(), file_layer.vocab_size())
-
- expected_weights = (["", "earth", "wind", "and", "fire"], [0, 3, 4, 5, 6])
- sorted_weights = zip_and_sort(expected_weights)
- self.assertAllEqual(sorted_weights, zip_and_sort(list_layer.get_weights()))
- self.assertAllEqual(0, len(file_layer.get_weights()))
-
- def test_file_vocab_and_list_vocab_identical_attrs_no_mask(self):
- vocab_data = ["earth", "wind", "and", "fire"]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- file_layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- num_oov_indices=2,
- mask_token=None,
- oov_token="[OOV]",
- dtype=dtypes.string)
-
- list_layer = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=2,
- mask_token=None,
- oov_token="[OOV]",
- dtype=dtypes.string)
-
- expected_vocab = ["[OOV]", "[OOV]", "earth", "wind", "and", "fire"]
- self.assertAllEqual(expected_vocab, list_layer.get_vocabulary())
- expected_vocab_size = 6
- self.assertAllEqual(expected_vocab_size, list_layer.vocab_size())
- self.assertAllEqual(list_layer.get_vocabulary(),
- file_layer.get_vocabulary())
- self.assertAllEqual(list_layer.vocab_size(), file_layer.vocab_size())
-
- expected_weights = (["earth", "wind", "and", "fire"], [2, 3, 4, 5])
- sorted_weights = zip_and_sort(expected_weights)
- self.assertAllEqual(sorted_weights, zip_and_sort(list_layer.get_weights()))
- self.assertAllEqual(0, len(file_layer.get_weights()))
-
- def test_int_output_file_vocab_no_mask(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "", "earth", "michigan"]])
- expected_output = [[1, 2, 3, 4], [4, 0, 1, 0]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- mask_token=None,
- num_oov_indices=1,
- oov_token="[OOV]",
- dtype=dtypes.string)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_file_vocab_no_oov_or_mask(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "wind", "earth", "and"]])
- expected_output = [[0, 1, 2, 3], [3, 1, 0, 2]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- mask_token=None,
- num_oov_indices=0,
- oov_token=None,
- dtype=dtypes.string)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_file_vocab_inversion(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([[1, 2, 3, 4], [4, 0, 1, 0]])
- expected_output = [["earth", "wind", "and", "fire"],
- ["fire", "[OOV]", "earth", "[OOV]"]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
- idata = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- mask_token=None,
- num_oov_indices=1,
- oov_token="[OOV]",
- dtype=dtypes.string)
- _ = layer(idata)
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
-
- invert_layer = index_lookup.IndexLookup(
- vocabulary=layer.get_vocabulary(),
- max_tokens=None,
- oov_token="[OOV]",
- mask_token=None,
- num_oov_indices=1,
- invert=True,
- dtype=dtypes.string)
- int_data = invert_layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_int_file_vocab(self):
- vocab_data = ["10", "20", "30", "40"]
- input_array = np.array([[10, 20, 30, 40], [40, 0, 10, 42]])
- expected_output = [[2, 3, 4, 5], [5, 0, 2, 1]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_file_vocab_setting_fails(self):
- vocab_data = ["earth", "wind", "and", "fire"]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_file,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
-
- with self.assertRaisesRegex(RuntimeError, "file path"):
- layer.set_vocabulary(vocab_data)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IndexLookupVocabularyTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest
- ):
-
- def test_int_output_explicit_vocab(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_explicit_vocab_with_special_tokens(self):
- vocab_data = ["", "[OOV]", "earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_get_vocabulary_no_special_tokens(self):
- vocab_data = ["", "[OOV]", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=5,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- returned_vocab = layer.get_vocabulary(include_special_tokens=False)
- self.assertAllEqual(returned_vocab, ["wind", "and", "fire"])
- self.assertAllEqual(layer.vocabulary_size(), 5)
-
- def test_vocab_with_max_cap(self):
- vocab_data = ["", "[OOV]", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=5,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- returned_vocab = layer.get_vocabulary()
- self.assertAllEqual(vocab_data, returned_vocab)
- self.assertAllEqual(layer.vocabulary_size(), 5)
-
- def test_int_vocab_with_max_cap(self):
- vocab_data = [0, -1, 42, 1276, 1138]
- layer = index_lookup.IndexLookup(
- max_tokens=5,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
- layer.set_vocabulary(vocab_data)
- returned_vocab = layer.get_vocabulary()
- self.assertAllEqual(vocab_data, returned_vocab)
- self.assertAllEqual(layer.vocabulary_size(), 5)
-
- def test_vocab_with_multiple_oov_indices(self):
- vocab_data = ["", "[OOV]", "[OOV]", "[OOV]", "wind"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=3,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- returned_vocab = layer.get_vocabulary()
- self.assertAllEqual(vocab_data, returned_vocab)
-
- def test_int_vocab_with_multiple_oov_indices(self):
- vocab_data = [0, -1, -1, -1, 42]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=3,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
- layer.set_vocabulary(vocab_data)
- returned_vocab = layer.get_vocabulary()
- self.assertAllEqual(vocab_data, returned_vocab)
-
- def test_non_unique_vocab_fails(self):
- vocab_data = ["earth", "wind", "and", "fire", "fire"]
- with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
- _ = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
-
- def test_vocab_with_oov_and_wrong_mask_fails(self):
- vocab_data = ["custom_mask", "[OOV]", "earth", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_vocab_with_oov_and_no_mask_fails(self):
- vocab_data = ["[OOV]", "earth", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_vocab_with_mask_but_no_oov_fails(self):
- vocab_data = ["", "earth", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_vocab_with_repeated_element_fails(self):
- vocab_data = ["earth", "earth", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_vocab_with_reserved_oov_element_fails(self):
- vocab_data = ["earth", "test", "[OOV]", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_vocab_with_reserved_mask_element_fails(self):
- vocab_data = ["earth", "mask_token", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="mask_token",
- oov_token="[OOV]",
- dtype=dtypes.string)
- with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_vocab_set_after_call_pad_to_max_false_fails(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- pad_to_max_tokens=False,
- output_mode=index_lookup.MULTI_HOT,
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- # Calling the layer should lock the vocabulary.
- _ = layer([["earth"]])
- with self.assertRaisesRegex(RuntimeError, "vocabulary cannot be changed"):
- layer.set_vocabulary(vocab_data)
-
- def test_vocab_with_idf_weights_non_tfidf_output_fails(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- weight_data = [1, 1, 1, 1, 1]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.MULTI_HOT,
- dtype=dtypes.string)
- with self.assertRaisesRegex(ValueError,
- "`idf_weights` should only be set if"):
- layer.set_vocabulary(vocab_data, idf_weights=weight_data)
-
- def test_vocab_with_idf_weights_length_mismatch_fails(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- weight_data = [1, 1, 1, 1, 1] # too long
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.TF_IDF,
- dtype=dtypes.string)
- with self.assertRaisesRegex(
- ValueError, "`idf_weights` must be the same length as vocab"):
- layer.set_vocabulary(vocab_data, idf_weights=weight_data)
-
- def test_vocab_without_idf_weights_tfidf_output_fails(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- output_mode=index_lookup.TF_IDF,
- dtype=dtypes.string)
- with self.assertRaisesRegex(
- ValueError, "`idf_weights` must be set if output_mode is TF_IDF"):
- layer.set_vocabulary(vocab_data)
-
- def test_non_unique_int_vocab_fails(self):
- vocab_data = [12, 13, 14, 15, 15]
- with self.assertRaisesRegex(ValueError, "repeated term.*15"):
- _ = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
-
- def test_int_vocab_with_oov_and_wrong_mask_fails(self):
- vocab_data = [1234, -1, 11, 21, 13, 14]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
- with self.assertRaisesRegex(ValueError, "does not have the mask token `0`"):
- layer.set_vocabulary(vocab_data)
-
- def test_int_vocab_with_oov_and_no_mask_fails(self):
- vocab_data = [-1, 11, 12, 13, 14]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
- with self.assertRaisesRegex(ValueError, "Reserved OOV"):
- layer.set_vocabulary(vocab_data)
-
- def test_int_vocab_with_mask_but_no_oov_fails(self):
- vocab_data = [0, 11, 12, 13, 14]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
- with self.assertRaisesRegex(ValueError, "does not have the OOV token `-1`"):
- layer.set_vocabulary(vocab_data)
-
- def test_int_vocab_with_repeated_element_fails(self):
- vocab_data = [11, 11, 34, 23, 124]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
- with self.assertRaisesRegex(ValueError, "repeated term.*11"):
- layer.set_vocabulary(vocab_data)
-
- def test_int_vocab_with_reserved_oov_element_fails(self):
- vocab_data = [14, 38, -1, 34, 3, 84]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
- with self.assertRaisesRegex(ValueError, "Reserved OOV"):
- layer.set_vocabulary(vocab_data)
-
- def test_int_vocab_with_reserved_mask_element_fails(self):
- vocab_data = [125, 0, 3, 4, 94]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
- with self.assertRaisesRegex(ValueError, "Reserved mask"):
- layer.set_vocabulary(vocab_data)
-
- def test_no_vocab_file_string_fails(self):
- with self.assertRaisesRegex(ValueError, ".*non_existent_file.*"):
- _ = index_lookup.IndexLookup(
- vocabulary="non_existent_file",
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IndexLookupInverseVocabularyTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_int_output_explicit_vocab(self):
- vocab_data = ["", "[OOV]", "earth", "wind", "and", "fire"]
- input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 1]])
- expected_output = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "[OOV]"]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- invert=True)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_vocab_with_max_cap(self):
- vocab_data = ["", "[OOV]", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=5,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- invert=True)
- layer.set_vocabulary(vocab_data)
- returned_vocab = layer.get_vocabulary()
- self.assertAllEqual(vocab_data, returned_vocab)
-
- def test_int_vocab_with_max_cap(self):
- vocab_data = [0, -1, 42, 1276, 1138]
- layer = index_lookup.IndexLookup(
- max_tokens=5,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64,
- invert=True)
- layer.set_vocabulary(vocab_data)
- returned_vocab = layer.get_vocabulary()
- self.assertAllEqual(vocab_data, returned_vocab)
-
- def test_non_unique_vocab_fails(self):
- vocab_data = ["earth", "wind", "and", "fire", "fire"]
- with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
- _ = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- invert=True)
-
- def test_non_int_output_fails(self):
- with self.assertRaisesRegex(ValueError, "`output_mode` must be int"):
- _ = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- output_mode=index_lookup.COUNT,
- invert=True)
-
- def test_vocab_with_repeated_element_fails(self):
- vocab_data = ["earth", "earth", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- invert=True)
- with self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_vocab_with_reserved_mask_element_fails(self):
- vocab_data = ["earth", "mask_token", "wind", "and", "fire"]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="mask_token",
- oov_token="[OOV]",
- dtype=dtypes.string,
- invert=True)
- with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_non_unique_int_vocab_fails(self):
- vocab_data = [12, 13, 14, 15, 15]
- with self.assertRaisesRegex(ValueError, ".*repeated term.*15.*"):
- _ = index_lookup.IndexLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64,
- invert=True)
-
- def test_int_vocab_with_repeated_element_fails(self):
- vocab_data = [11, 11, 34, 23, 124]
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token=0,
- oov_token=-1,
- dtype=dtypes.int64,
- invert=True)
- with self.assertRaisesRegex(ValueError, ".*repeated term.*11.*"):
- layer.set_vocabulary(vocab_data)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IndexLookupErrorTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_too_long_vocab_fails_in_single_setting(self):
- vocab_data = ["earth", "wind", "and", "fire"]
-
- layer = index_lookup.IndexLookup(
- max_tokens=4,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- with self.assertRaisesRegex(ValueError,
- "vocabulary larger than the maximum vocab.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_zero_max_tokens_fails(self):
- with self.assertRaisesRegex(ValueError, ".*max_tokens.*"):
- _ = index_lookup.IndexLookup(
- max_tokens=0,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IndexLookupSavingTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def _write_to_temp_file(self, file_name, vocab_list):
- vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
- with gfile.GFile(vocab_path, "w") as writer:
- for vocab in vocab_list:
- writer.write(vocab + "\n")
- writer.flush()
- writer.close()
- return vocab_path
-
- def test_vocabulary_persistence_across_saving(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- model.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
-
- loaded_model = keras.models.load_model(
- output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup})
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = loaded_model.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- def test_vocabulary_persistence_file_across_cloning(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- vocabulary=vocab_file)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(output_dataset, expected_output)
-
- # Clone the model.
- new_model = keras.models.clone_model(model)
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, new_model)
-
- # Validate correctness of the new model.
- new_output_dataset = new_model.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- def test_persistence_file_vocabs_tf_save_tf_load(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- vocabulary=vocab_file)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- save.save(obj=model, export_dir=output_path)
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
-
- loaded_model = load.load(output_path)
- f = loaded_model.signatures["serving_default"]
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = f(constant_op.constant(input_array))["index_lookup"]
- self.assertAllEqual(new_output_dataset, expected_output)
-
- def test_vocabulary_persistence_file_vocab_keras_save_tf_load(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- vocabulary=vocab_file)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- model.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
-
- loaded_model = load.load(output_path)
- f = loaded_model.signatures["serving_default"]
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = f(constant_op.constant(input_array))["index_lookup"]
- self.assertAllEqual(new_output_dataset, expected_output)
-
- def test_persistence_file_vocab_keras_save_keras_load(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- vocabulary=vocab_file)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- model.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
- gfile.Remove(vocab_file)
-
- loaded_model = keras.models.load_model(
- output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup})
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = loaded_model.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- # Try re-saving the layer. This simulates saving a layer contained in
- # a hub Module.
- input_data_2 = keras.Input(shape=(None,), dtype=dtypes.string)
- output_2 = loaded_model(input_data_2)
- model_2 = keras.Model(inputs=input_data_2, outputs=output_2)
- new_output_dataset = model_2.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model_2")
- model_2.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
-
- loaded_model = keras.models.load_model(
- output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup})
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = loaded_model.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- def test_persistence_file_vocab_keras_save_keras_load_tf_save_tf_load(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- vocabulary=vocab_file)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- model.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
- gfile.Remove(vocab_file)
-
- loaded_model = keras.models.load_model(
- output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup})
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = loaded_model.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- # Try re-saving the layer. This simulates saving a layer contained in
- # a hub Module.
- input_data_2 = keras.Input(shape=(None,), dtype=dtypes.string)
- output_2 = loaded_model(input_data_2)
- model_2 = keras.Model(inputs=input_data_2, outputs=output_2)
- new_output_dataset = model_2.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model_2")
- save.save(model_2, output_path)
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
-
- loaded_model = load.load(output_path)
- f = loaded_model.signatures["serving_default"]
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = f(constant_op.constant(input_array))["model"]
- self.assertAllEqual(new_output_dataset, expected_output)
-
- def test_persistence_file_vocab_keras_save_keras_load_keras_save_keras_load(
- self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- vocab_file = self._write_to_temp_file("temp", vocab_data)
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = index_lookup.IndexLookup(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- vocabulary=vocab_file)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- model.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
- gfile.Remove(vocab_file)
-
- loaded_model = keras.models.load_model(
- output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup})
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = loaded_model.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- # Try re-saving the layer. This simulates saving a layer contained in
- # a hub Module.
- input_data_2 = keras.Input(shape=(None,), dtype=dtypes.string)
- output_2 = loaded_model(input_data_2)
- model_2 = keras.Model(inputs=input_data_2, outputs=output_2)
- new_output_dataset = model_2.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model_2")
- model_2.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
-
- loaded_model = keras.models.load_model(
- output_path, custom_objects={"IndexLookup": index_lookup.IndexLookup})
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = loaded_model.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
- def test_static_table_config_weight_data_transfer_succeeds(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- vocab_file = self._write_to_temp_file("temp", vocab_data)
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
-
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- # Build and validate a golden model.
- layer_cls = index_lookup.IndexLookup
- layer = layer_cls(
- max_tokens=None,
- num_oov_indices=1,
- mask_token="",
- oov_token="[OOV]",
- dtype=dtypes.string,
- vocabulary=vocab_file)
- config = layer.get_config()
- weights = layer.get_weights()
-
- layer = layer_cls.from_config(config)
- layer.set_weights(weights)
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- output = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=output)
-
- new_output_dataset = model.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IndexLookupStringCombinerTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def compare_text_accumulators(self, a, b, msg=None):
- if a is None or b is None:
- self.assertAllEqual(a, b, msg=msg)
- return
-
- self.assertAllEqual(a.count_dict, b.count_dict, msg=msg)
-
- compare_accumulators = compare_text_accumulators
-
- def update_accumulator(self, accumulator, data):
- accumulator.count_dict.update(dict(zip(data["vocab"], data["counts"])))
-
- return accumulator
-
- def test_combiner_api_compatibility_int_mode(self):
- data = np.array([["earth", "wind", "and", "fire"],
- ["earth", "wind", "and", "michigan"]])
- combiner = index_lookup._IndexLookupCombiner()
- expected_accumulator_output = {
- "vocab": np.array(["and", "earth", "wind", "fire", "michigan"]),
- "counts": np.array([2, 2, 2, 1, 1]),
- }
- expected_extract_output = {
- "vocab": np.array(["wind", "earth", "and", "michigan", "fire"]),
- "idf_weights": None,
- }
- expected_accumulator = combiner._create_accumulator()
- expected_accumulator = self.update_accumulator(expected_accumulator,
- expected_accumulator_output)
- self.validate_accumulator_serialize_and_deserialize(combiner, data,
- expected_accumulator)
- self.validate_accumulator_uniqueness(combiner, data)
- self.validate_accumulator_extract(combiner, data, expected_extract_output)
-
- # TODO(askerryryan): Add tests confirming equivalence to behavior of
- # existing tf.keras.preprocessing.text.Tokenizer.
- @parameterized.named_parameters(
- {
- "testcase_name":
- "top_k_smaller_than_full_vocab",
- "data":
- np.array([["earth", "wind"], ["fire", "wind"], ["and"],
- ["fire", "wind"]]),
- "vocab_size":
- 3,
- "expected_accumulator_output": {
- "vocab": np.array(["wind", "fire", "earth", "and"]),
- "counts": np.array([3, 2, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array(["wind", "fire", "earth"]),
- "idf_weights": None,
- },
- },
- {
- "testcase_name":
- "top_k_larger_than_full_vocab",
- "data":
- np.array([["earth", "wind"], ["fire", "wind"], ["and"],
- ["fire", "wind"]]),
- "vocab_size":
- 10,
- "expected_accumulator_output": {
- "vocab": np.array(["wind", "fire", "earth", "and"]),
- "counts": np.array([3, 2, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array(["wind", "fire", "earth", "and"]),
- "idf_weights": None,
- },
- },
- {
- "testcase_name":
- "no_top_k",
- "data":
- np.array([["earth", "wind"], ["fire", "wind"], ["and"],
- ["fire", "wind"]]),
- "vocab_size":
- None,
- "expected_accumulator_output": {
- "vocab": np.array(["wind", "fire", "earth", "and"]),
- "counts": np.array([3, 2, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array(["wind", "fire", "earth", "and"]),
- "idf_weights": None,
- },
- },
- {
- "testcase_name": "single_element_per_row",
- "data": np.array([["earth"], ["wind"], ["fire"], ["wind"], ["and"]]),
- "vocab_size": 3,
- "expected_accumulator_output": {
- "vocab": np.array(["wind", "and", "earth", "fire"]),
- "counts": np.array([2, 1, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array(["wind", "fire", "earth"]),
- "idf_weights": None,
- },
- },
- # Which tokens are retained is based on global frequency, and is thus
- # sensitive to frequency within a document. In contrast, because idf only
- # considers the presence of a token in a document, it is insensitive
- # to the frequency of the token within the document.
- {
- "testcase_name":
- "retained_tokens_sensitive_to_within_document_frequency",
- "data":
- np.array([["earth", "earth"], ["wind", "wind"], ["fire", "fire"],
- ["wind", "wind"], ["and", "michigan"]]),
- "vocab_size":
- 3,
- "expected_accumulator_output": {
- "vocab": np.array(["wind", "earth", "fire", "and", "michigan"]),
- "counts": np.array([4, 2, 2, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array(["wind", "fire", "earth"]),
- "idf_weights": None,
- },
- })
- def test_combiner_computation(self, data, vocab_size,
- expected_accumulator_output,
- expected_extract_output):
- combiner = index_lookup._IndexLookupCombiner(vocab_size=vocab_size)
- expected_accumulator = combiner._create_accumulator()
- expected_accumulator = self.update_accumulator(expected_accumulator,
- expected_accumulator_output)
- self.validate_accumulator_computation(combiner, data, expected_accumulator)
- self.validate_accumulator_extract(combiner, data, expected_extract_output)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IndexLookupIntCombinerTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest
- ):
-
- def compare_text_accumulators(self, a, b, msg=None):
- if a is None or b is None:
- self.assertAllEqual(a, b, msg=msg)
- return
-
- self.assertAllEqual(a.count_dict, b.count_dict, msg=msg)
-
- compare_accumulators = compare_text_accumulators
-
- def update_accumulator(self, accumulator, data):
- accumulator.count_dict.update(dict(zip(data["vocab"], data["counts"])))
-
- return accumulator
-
- def test_combiner_api_compatibility_int_mode(self):
- data = np.array([[42, 1138, 725, 1729], [42, 1138, 725, 203]])
- combiner = index_lookup._IndexLookupCombiner()
- expected_accumulator_output = {
- "vocab": np.array([1138, 725, 42, 1729, 203]),
- "counts": np.array([2, 2, 2, 1, 1]),
- }
- expected_extract_output = {
- "vocab": np.array([1138, 725, 42, 1729, 203]),
- "idf_weights": None,
- }
- expected_accumulator = combiner._create_accumulator()
- expected_accumulator = self.update_accumulator(expected_accumulator,
- expected_accumulator_output)
- self.validate_accumulator_serialize_and_deserialize(combiner, data,
- expected_accumulator)
- self.validate_accumulator_uniqueness(combiner, data)
- self.validate_accumulator_extract(combiner, data, expected_extract_output)
-
- # TODO(askerryryan): Add tests confirming equivalence to behavior of
- # existing tf.keras.preprocessing.text.Tokenizer.
- @parameterized.named_parameters(
- {
- "testcase_name": "top_k_smaller_than_full_vocab",
- "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
- "vocab_size": 3,
- "expected_accumulator_output": {
- "vocab": np.array([1138, 1729, 725, 42]),
- "counts": np.array([3, 2, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array([1138, 1729, 725]),
- "idf_weights": None,
- },
- },
- {
- "testcase_name": "top_k_larger_than_full_vocab",
- "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
- "vocab_size": 10,
- "expected_accumulator_output": {
- "vocab": np.array([1138, 1729, 725, 42]),
- "counts": np.array([3, 2, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array([1138, 1729, 725, 42]),
- "idf_weights": None,
- },
- },
- {
- "testcase_name": "no_top_k",
- "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]),
- "vocab_size": None,
- "expected_accumulator_output": {
- "vocab": np.array([1138, 1729, 725, 42]),
- "counts": np.array([3, 2, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array([1138, 1729, 725, 42]),
- "idf_weights": None,
- },
- },
- {
- "testcase_name": "single_element_per_row",
- "data": np.array([[42], [1138], [1729], [1138], [725]]),
- "vocab_size": 3,
- "expected_accumulator_output": {
- "vocab": np.array([1138, 1729, 725, 42]),
- "counts": np.array([2, 1, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array([1138, 1729, 725]),
- "idf_weights": None,
- },
- },
- # Which tokens are retained is based on global frequency, and is thus
- # sensitive to frequency within a document. In contrast, because idf only
- # considers the presence of a token in a document, it is insensitive
- # to the frequency of the token within the document.
- {
- "testcase_name":
- "retained_tokens_sensitive_to_within_document_frequency",
- "data":
- np.array([[42, 42], [1138, 1138], [1729, 1729], [1138, 1138],
- [725, 203]]),
- "vocab_size":
- 3,
- "expected_accumulator_output": {
- "vocab": np.array([1138, 42, 1729, 725, 203]),
- "counts": np.array([4, 2, 2, 1, 1]),
- },
- "expected_extract_output": {
- "vocab": np.array([1138, 1729, 42]),
- "idf_weights": None,
- },
- })
- def test_combiner_computation(self, data, vocab_size,
- expected_accumulator_output,
- expected_extract_output):
- combiner = index_lookup._IndexLookupCombiner(vocab_size=vocab_size)
- expected_accumulator = combiner._create_accumulator()
- expected_accumulator = self.update_accumulator(expected_accumulator,
- expected_accumulator_output)
- self.validate_accumulator_computation(combiner, data, expected_accumulator)
- self.validate_accumulator_extract(combiner, data, expected_extract_output)
-
-
-if __name__ == "__main__":
- test.main()
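
Note: the combiner tests deleted above assert that vocabulary extraction keeps tokens ordered by descending global frequency and truncated to `vocab_size`. The following is a minimal, self-contained sketch of that expected behavior, assuming simple first-seen tie-breaking; it is an illustration only, not the deleted `_IndexLookupCombiner` implementation.

# Illustration only (assumed simplification, not the deleted _IndexLookupCombiner):
# extract a vocabulary ordered by descending global frequency, capped at
# vocab_size, mirroring the "expected_extract_output" values asserted above.
import collections
import itertools

def extract_vocab(batches, vocab_size=None):
  counts = collections.Counter(itertools.chain.from_iterable(batches))
  vocab = [token for token, _ in counts.most_common(vocab_size)]
  return {"vocab": vocab, "idf_weights": None}

# The "top_k_smaller_than_full_vocab" case: wind (3), fire (2), earth (1).
print(extract_vocab(
    [["earth", "wind"], ["fire", "wind"], ["and"], ["fire", "wind"]],
    vocab_size=3))
# {'vocab': ['wind', 'fire', 'earth'], 'idf_weights': None}
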
diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py
deleted file mode 100644
index d6ce3b7..0000000
--- a/tensorflow/python/keras/layers/preprocessing/integer_lookup.py
+++ /dev/null
@@ -1,358 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras string lookup preprocessing layer."""
-# pylint: disable=g-classes-have-attributes
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.keras.layers.preprocessing import index_lookup
-from tensorflow.python.keras.layers.preprocessing import table_utils
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util.tf_export import keras_export
-
-
-@keras_export("keras.layers.experimental.preprocessing.IntegerLookup", v1=[])
-class IntegerLookup(index_lookup.IndexLookup):
- """Reindex integer inputs to be in a contiguous range, via a dict lookup.
-
- This layer maps a set of arbitrary integer input tokens into indexed
- integer output via a table-based vocabulary lookup. The layer's output indices
- will be contiguously arranged up to the maximum vocab size, even if the input
- tokens are non-contiguous or unbounded. The layer supports multiple options
- for encoding the output via `output_mode`, and has optional support for
- out-of-vocabulary (OOV) tokens and masking.
-
- The vocabulary for the layer can be supplied on construction or learned via
- `adapt()`. During `adapt()`, the layer will analyze a data set, determine the
- frequency of individual integer tokens, and create a vocabulary from them. If
- the vocabulary is capped in size, the most frequent tokens will be used to
- create the vocabulary and all others will be treated as OOV.
-
- There are two possible output modes for the layer.
- When `output_mode` is `"int"`,
- input integers are converted to their index in the vocabulary (an integer).
- When `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`, input integers
- are encoded into an array where each dimension corresponds to an element in
- the vocabulary.
-
- The vocabulary can optionally contain a mask token as well as an OOV token
- (which can optionally occupy multiple indices in the vocabulary, as set
- by `num_oov_indices`).
- The position of these tokens in the vocabulary is fixed. When `output_mode` is
- `"int"`, the vocabulary will begin with the mask token at index 0, followed by
- OOV indices, followed by the rest of the vocabulary. When `output_mode` is
- `"multi_hot"`, `"count"`, or `"tf_idf"` the vocabulary will begin with OOV
- indices and instances of the mask token will be dropped.
-
- Args:
- max_tokens: The maximum size of the vocabulary for this layer. If None,
- there is no cap on the size of the vocabulary. Note that this size
- includes the OOV and mask tokens. Defaults to None.
- num_oov_indices: The number of out-of-vocabulary tokens to use. If this
- value is more than 1, OOV inputs are modulated to determine their OOV
- value. If this value is 0, OOV inputs will cause an error when calling the
- layer. Defaults to 1.
- mask_token: An integer token that represents masked inputs. When
- `output_mode` is `"int"`, the token is included in vocabulary and mapped
- to index 0. In other output modes, the token will not appear in the
- vocabulary and instances of the mask token in the input will be dropped.
- If set to None, no mask term will be added. Defaults to None.
- oov_token: Only used when `invert` is True. The token to return for OOV
- indices. Defaults to -1.
- vocabulary: An optional list of integer tokens, or a path to a text file
- containing a vocabulary to load into this layer. The file should contain
- one integer token per line. If the list or file contains the same token
- multiple times, an error will be thrown.
- invert: Only valid when `output_mode` is `"int"`. If True, this layer will
- map indices to vocabulary items instead of mapping vocabulary items to
- indices. Defaults to False.
- output_mode: Specification for the output of the layer. Defaults to `"int"`.
- Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or
- `"tf_idf"` configuring the layer as follows:
- - `"int"`: Return the vocabulary indices of the input tokens.
- - `"one_hot"`: Encodes each individual element in the input into an
- array the same size as the vocabulary, containing a 1 at the element
- index. If the last dimension is size 1, will encode on that dimension.
- If the last dimension is not size 1, will append a new dimension for
- the encoded output.
- - `"multi_hot"`: Encodes each sample in the input into a single array
- the same size as the vocabulary, containing a 1 for each vocabulary
- term present in the sample. Treats the last dimension as the sample
- dimension; if input shape is (..., sample_length), output shape will
- be (..., num_tokens).
- - `"count"`: As `"multi_hot"`, but the int array contains a count of the
- number of times the token at that index appeared in the sample.
- - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
- find the value in each token slot.
- pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`,
- `"count"`, or `"tf_idf"`. If True, the output will have its feature axis
- padded to `max_tokens` even if the number of unique tokens in the
- vocabulary is less than max_tokens, resulting in a tensor of shape
- [batch_size, max_tokens] regardless of vocabulary size. Defaults to False.
- sparse: Boolean. Only applicable when `output_mode` is `"multi_hot"`,
- `"count"`, or `"tf_idf"`. If True, returns a `SparseTensor` instead of a
- dense `Tensor`. Defaults to False.
-
- Examples:
-
- **Creating a lookup layer with a known vocabulary**
-
- This example creates a lookup layer with a pre-existing vocabulary.
-
- >>> vocab = [12, 36, 1138, 42]
- >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) # Note OOV tokens
- >>> layer = IntegerLookup(vocabulary=vocab)
- >>> layer(data)
- <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
- array([[1, 3, 4],
- [4, 0, 2]])>
-
- **Creating a lookup layer with an adapted vocabulary**
-
- This example creates a lookup layer and generates the vocabulary by analyzing
- the dataset.
-
- >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]])
- >>> layer = IntegerLookup()
- >>> layer.adapt(data)
- >>> layer.get_vocabulary()
- [-1, 42, 1138, 1000, 36, 12]
-
- Note that the OOV token -1 has been added to the vocabulary. The remaining
- tokens are sorted by frequency (42, which has 2 occurrences, is first), then
- by inverse sort order.
-
- >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]])
- >>> layer = IntegerLookup()
- >>> layer.adapt(data)
- >>> layer(data)
- <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
- array([[5, 2, 1],
- [1, 3, 4]])>
-
-
- **Lookups with multiple OOV indices**
-
- This example demonstrates how to use a lookup layer with multiple OOV indices.
- When a layer is created with more than one OOV index, any OOV tokens are
- hashed into the number of OOV buckets, distributing OOV tokens in a
- deterministic fashion across the set.
-
- >>> vocab = [12, 36, 1138, 42]
- >>> data = tf.constant([[12, 1138, 42], [37, 1000, 36]])
- >>> layer = IntegerLookup(vocabulary=vocab, num_oov_indices=2)
- >>> layer(data)
- <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
- array([[2, 4, 5],
- [1, 0, 3]])>
-
- Note that the output for OOV token 37 is 1, while the output for OOV token
- 1000 is 0. The in-vocab terms have their output index increased by 1 from
- earlier examples (12 maps to 2, etc.) in order to make space for the extra OOV
- token.
-
- **One-hot output**
-
- Configure the layer with `output_mode='one_hot'`. Note that the first
- `num_oov_indices` dimensions in the one_hot encoding represent OOV values.
-
- >>> vocab = [12, 36, 1138, 42]
- >>> data = tf.constant([12, 36, 1138, 42, 7]) # Note OOV tokens
- >>> layer = IntegerLookup(vocabulary=vocab, output_mode='one_hot')
- >>> layer(data)
- <tf.Tensor: shape=(5, 5), dtype=float32, numpy=
- array([[0., 1., 0., 0., 0.],
- [0., 0., 1., 0., 0.],
- [0., 0., 0., 1., 0.],
- [0., 0., 0., 0., 1.],
- [1., 0., 0., 0., 0.]], dtype=float32)>
-
- **Multi-hot output**
-
- Configure the layer with `output_mode='multi_hot'`. Note that the first
- `num_oov_indices` dimensions in the multi_hot encoding represent OOV tokens.
-
- >>> vocab = [12, 36, 1138, 42]
- >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens
- >>> layer = IntegerLookup(vocabulary=vocab, output_mode='multi_hot')
- >>> layer(data)
- <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
- array([[0., 1., 0., 1., 1.],
- [1., 0., 1., 0., 1.]], dtype=float32)>
-
- **Token count output**
-
- Configure the layer with `output_mode='count'`. As with multi_hot output, the
- first `num_oov_indices` dimensions in the output represent OOV tokens.
-
- >>> vocab = [12, 36, 1138, 42]
- >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens
- >>> layer = IntegerLookup(vocabulary=vocab, output_mode='count')
- >>> layer(data)
- <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
- array([[0., 1., 0., 1., 2.],
- [2., 0., 1., 0., 1.]], dtype=float32)>
-
- **TF-IDF output**
-
- Configure the layer with `output_mode='tf_idf'`. As with multi_hot output, the
- first `num_oov_indices` dimensions in the output represent OOV tokens.
-
- Each token bin will output `token_count * idf_weight`, where the idf weights
- are the inverse document frequency weights per token. These should be provided
- along with the vocabulary. Note that the `idf_weight` for OOV tokens will
- default to the average of all idf weights passed in.
-
- >>> vocab = [12, 36, 1138, 42]
- >>> idf_weights = [0.25, 0.75, 0.6, 0.4]
- >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens
- >>> layer = IntegerLookup(output_mode='tf_idf')
- >>> layer.set_vocabulary(vocab, idf_weights=idf_weights)
- >>> layer(data)
- <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
- array([[0. , 0.25, 0. , 0.6 , 0.8 ],
- [1.0 , 0. , 0.75, 0. , 0.4 ]], dtype=float32)>
-
- To specify the idf weights for OOV tokens, you will need to pass the entire
- vocabulary, including the leading OOV token.
-
- >>> vocab = [-1, 12, 36, 1138, 42]
- >>> idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4]
- >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens
- >>> layer = IntegerLookup(output_mode='tf_idf')
- >>> layer.set_vocabulary(vocab, idf_weights=idf_weights)
- >>> layer(data)
- <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
- array([[0. , 0.25, 0. , 0.6 , 0.8 ],
- [1.8 , 0. , 0.75, 0. , 0.4 ]], dtype=float32)>
-
- When adapting the layer in tf_idf mode, each input sample will be considered a
- document, and the idf weight per token will be calculated as
- `log(1 + num_documents / (1 + token_document_count))`.
-
- **Inverse lookup**
-
- This example demonstrates how to map indices to tokens using this layer. (You
- can also use adapt() with invert=True, but for simplicity we'll pass the
- vocab in this example.)
-
- >>> vocab = [12, 36, 1138, 42]
- >>> data = tf.constant([[1, 3, 4], [4, 0, 2]])
- >>> layer = IntegerLookup(vocabulary=vocab, invert=True)
- >>> layer(data)
- <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
- array([[ 12, 1138, 42],
- [ 42, -1, 36]])>
-
- Note that the first index corresponds to the OOV token by default.
-
-
- **Forward and inverse lookup pairs**
-
- This example demonstrates how to use the vocabulary of a standard lookup
- layer to create an inverse lookup layer.
-
- >>> vocab = [12, 36, 1138, 42]
- >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]])
- >>> layer = IntegerLookup(vocabulary=vocab)
- >>> i_layer = IntegerLookup(vocabulary=layer.get_vocabulary(), invert=True)
- >>> int_data = layer(data)
- >>> i_layer(int_data)
- <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
- array([[ 12, 1138, 42],
- [ 42, -1, 36]])>
-
- In this example, the input token 1000 resulted in an output of -1, since
- 1000 was not in the vocabulary - it got represented as an OOV, and all OOV
- tokens are returned as -1 in the inverse layer. Also, note that for the
- inverse to work, you must have already set the forward layer vocabulary
- either directly or via `adapt()` before calling `get_vocabulary()`.
- """
-
- def __init__(self,
- max_tokens=None,
- num_oov_indices=1,
- mask_token=None,
- oov_token=-1,
- vocabulary=None,
- invert=False,
- output_mode=index_lookup.INT,
- sparse=False,
- pad_to_max_tokens=False,
- **kwargs):
- allowed_dtypes = [dtypes.int64]
-
- # Support deprecated args for this layer.
- if "max_values" in kwargs:
- logging.log_first_n(logging.WARN,
- "max_values is deprecated, use max_tokens instead.",
- 1)
- max_tokens = kwargs["max_values"]
- del kwargs["max_values"]
- if "mask_value" in kwargs:
- logging.log_first_n(logging.WARN,
- "mask_value is deprecated, use mask_token instead.",
- 1)
- mask_token = kwargs["mask_value"]
- del kwargs["mask_value"]
- if "oov_value" in kwargs:
- logging.log_first_n(logging.WARN,
- "oov_value is deprecated, use oov_token instead.", 1)
- oov_token = kwargs["oov_value"]
- del kwargs["oov_value"]
-
- if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
- raise ValueError("The value of the dtype argument for IntegerLookup may "
- "only be one of %s." % (allowed_dtypes,))
-
- if "dtype" not in kwargs:
- kwargs["dtype"] = dtypes.int64
-
- # If max_tokens is set, its value must be greater than 1 - otherwise we
- # are creating a 0-element vocab, which doesn't make sense.
- if max_tokens is not None and max_tokens <= 1:
- raise ValueError("If set, max_tokens must be greater than 1. "
- "You passed %s" % (max_tokens,))
-
- if num_oov_indices < 0:
- raise ValueError(
- "num_oov_indices must be greater than or equal to 0. You passed %s" %
- (num_oov_indices,))
-
- super(IntegerLookup, self).__init__(
- max_tokens=max_tokens,
- num_oov_indices=num_oov_indices,
- mask_token=mask_token,
- oov_token=oov_token,
- vocabulary=vocabulary,
- invert=invert,
- output_mode=output_mode,
- sparse=sparse,
- pad_to_max_tokens=pad_to_max_tokens,
- **kwargs)
-
- def set_vocabulary(self, vocabulary, idf_weights=None):
- if isinstance(vocabulary, str):
- if self.output_mode == index_lookup.TF_IDF:
- raise RuntimeError(
- "Setting vocabulary directly from a file is not "
- "supported in TF-IDF mode, since this layer cannot "
- "read files containing TF-IDF weight data. Please "
- "read the file using Python and set the vocabulary "
- "and weights by passing lists or arrays to the "
- "set_vocabulary function's `vocabulary` and `idf_weights` "
- "args.")
- vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
- vocabulary = [int(v) for v in vocabulary]
- super().set_vocabulary(vocabulary, idf_weights=idf_weights)
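
Note: the deleted docstring above states that adapt-time TF-IDF weights are computed as `log(1 + num_documents / (1 + token_document_count))`. Below is a short sketch of that formula on a toy dataset; `idf_weights` here is a hypothetical helper for illustration only, not part of the removed layer's API.

# Sketch of the adapt-time idf formula quoted in the docstring above:
#   idf = log(1 + num_documents / (1 + token_document_count))
# Illustration only; the removed IntegerLookup computed this inside adapt().
import math

def idf_weights(documents, vocab):
  num_documents = len(documents)
  return [
      math.log(1 + num_documents /
               (1 + sum(1 for doc in documents if token in doc)))
      for token in vocab
  ]

docs = [[12, 1138, 42, 42], [42, 36]]  # two toy "documents"
print(idf_weights(docs, vocab=[12, 36, 1138, 42]))
# 42 appears in both documents, so it receives the smallest idf weight.
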
diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py
deleted file mode 100644
index d659cd9..0000000
--- a/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py
+++ /dev/null
@@ -1,632 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Keras text vectorization preprocessing layer."""
-
-import gc
-import itertools
-import os
-import random
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python import tf2
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import def_function
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.layers.preprocessing import integer_lookup
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-
-
-def _get_end_to_end_test_cases():
- test_cases = (
- {
- "testcase_name":
- "test_ints_soft_vocab_cap",
- # Create an array where 1138 is the most frequent term, followed by
- # 1729, then 725, then 42. This ensures that the vocab accumulator
- # is sorting by frequency.
- "vocab_data":
- np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729],
- [1729], [725], [725]],
- dtype=np.int64),
- "input_data":
- np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]],
- dtype=np.int64),
- "kwargs": {
- "max_tokens": None,
- "dtype": dtypes.int64,
- },
- "expected_output": [[1], [2], [3], [4], [4], [3], [1], [0]],
- "input_dtype":
- dtypes.int64
- },)
-
- crossed_test_cases = []
- # Cross above test cases with use_dataset in (True, False)
- for use_dataset in (True, False):
- for case in test_cases:
- case = case.copy()
- if use_dataset:
- case["testcase_name"] = case["testcase_name"] + "_with_dataset"
- case["use_dataset"] = use_dataset
- crossed_test_cases.append(case)
-
- return crossed_test_cases
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IntegerLookupLayerTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- @parameterized.named_parameters(*_get_end_to_end_test_cases())
- def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
- use_dataset, expected_output,
- input_dtype):
- cls = integer_lookup.IntegerLookup
- expected_output_dtype = dtypes.int64
- input_shape = input_data.shape
-
- if use_dataset:
- # Keras APIs expect batched datasets.
- # TODO(rachelim): `model.predict` predicts the result on each
- # dataset batch separately, then tries to concatenate the results
- # together. When the results have different shapes on the non-concat
- # axis (which can happen in the output_mode = INT case for
- # IntegerLookup), the concatenation fails. In real use cases, this may
- # not be an issue because users are likely to pipe the preprocessing layer
- # into other keras layers instead of predicting it directly. A workaround
- # for these unit tests is to have the dataset only contain one batch, so
- # no concatenation needs to happen with the result. For consistency with
- # numpy input, we should make `predict` join differently shaped results
- # together sensibly, with 0 padding.
- input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
- input_shape[0])
- vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
- input_shape[0])
-
- with CustomObjectScope({"IntegerLookup": cls}):
- output_data = testing_utils.layer_test(
- cls,
- kwargs=kwargs,
- input_shape=input_shape,
- input_data=input_data,
- input_dtype=input_dtype,
- expected_output_dtype=expected_output_dtype,
- validate_training=False,
- adapt_data=vocab_data)
- self.assertAllClose(expected_output, output_data)
-
- def test_layer_with_list_input(self):
- vocab = [12, 36, 1138, 42]
- data = [[12, 1138, 42], [42, 1000, 36]] # Note OOV tokens
- layer = integer_lookup.IntegerLookup(vocabulary=vocab)
- output = layer(data)
- expected_output = np.array([[1, 3, 4], [4, 0, 2]])
- self.assertEqual(output.numpy().tolist(), expected_output.tolist())
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoricalEncodingInputTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_sparse_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=np.array([13, 32], dtype=np.int64),
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [4, 0]
- expected_dense_shape = [3, 4]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
- layer = integer_lookup.IntegerLookup(max_tokens=None)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_ragged_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
- dtype=np.int64)
- expected_output = [[1, 2, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
- layer = integer_lookup.IntegerLookup(max_tokens=None)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoricalEncodingMultiOOVTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_sparse_int_input_multi_bucket(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=np.array([13, 133], dtype=np.int64),
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [6, 2]
- expected_dense_shape = [3, 4]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True)
- layer = integer_lookup.IntegerLookup(
- max_tokens=None,
- dtype=dtypes.int64,
- num_oov_indices=2,
- mask_token=0,
- oov_token=-1)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array, steps=1)
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_ragged_int_input_multi_bucket(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 133]],
- dtype=np.int64)
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True)
- layer = integer_lookup.IntegerLookup(max_tokens=None, num_oov_indices=2)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoricalEncodingAdaptTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_sparse_adapt(self):
- vocab_data = sparse_tensor.SparseTensor(
- indices=[[0, 0], [0, 1], [1, 2]],
- values=[203, 1729, 203],
- dense_shape=[3, 4])
- vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
-
- layer = integer_lookup.IntegerLookup()
- layer.adapt(vocab_dataset)
- expected_vocabulary = [-1, 203, 1729]
- self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
-
- def test_ragged_adapt(self):
- vocab_data = ragged_factory_ops.constant([[203], [1729, 203]])
- vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
-
- layer = integer_lookup.IntegerLookup()
- layer.adapt(vocab_dataset)
- expected_vocabulary = [-1, 203, 1729]
- self.assertAllEqual(expected_vocabulary, layer.get_vocabulary())
-
- def test_single_int_generator_dataset(self):
-
- def word_gen():
- for _ in itertools.count(1):
- yield random.randint(0, 100)
-
- ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.int64,
- tensor_shape.TensorShape([]))
- batched_ds = ds.take(2)
- input_t = keras.Input(shape=(), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(
- max_tokens=10, num_oov_indices=0, mask_token=None, oov_token=None)
- _ = layer(input_t)
- layer.adapt(batched_ds)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IntegerLookupOutputTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_int_output(self):
- vocab_data = [42, 1138, 725, 1729]
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup()
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_output_shape(self):
- input_data = keras.Input(shape=(4,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(max_tokens=2, num_oov_indices=1)
- int_data = layer(input_data)
- self.assertAllEqual(int_data.shape[1:], input_data.shape[1:])
-
- def test_int_output_with_mask(self):
- vocab_data = [42, 1138, 725, 1729]
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(max_tokens=None, mask_token=0)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_explicit_vocab(self):
- vocab_data = [42, 1138, 725, 1729]
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- )
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_explicit_vocab_with_special_tokens(self):
- vocab_data = [0, -1, 42, 1138, 725, 1729]
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(
- vocabulary=vocab_data,
- max_tokens=None,
- mask_token=0,
- )
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_no_oov(self):
- vocab_data = [42, 1138, 725, 1729]
- valid_input = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 0]])
- invalid_input = np.array([[42, 1138, 725, 203], [1729, 725, 42, 203]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(
- vocabulary=vocab_data, mask_token=0, num_oov_indices=0)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(valid_input)
- self.assertAllEqual(expected_output, output_data)
- with self.assertRaisesRegex(errors.InvalidArgumentError,
- "found OOV values.*203"):
- _ = model.predict(invalid_input)
-
- def test_inverse_output(self):
- vocab_data = [-1, 42, 1138, 725, 1729]
- input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]])
- expected_output = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(invert=True)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_forward_backward_explicit_vocab(self):
- vocab_data = [42, 1138, 725, 1729]
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(vocabulary=vocab_data)
- inverse_layer = integer_lookup.IntegerLookup(
- vocabulary=vocab_data, invert=True)
- int_data = layer(input_data)
- inverse_data = inverse_layer(int_data)
- model = keras.Model(inputs=input_data, outputs=inverse_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_forward_backward_adapted_vocab(self):
- adapt_data = [42, 1138, 725, 1729]
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = np.array([[42, 1138, 725, 1729], [1729, 725, 42, -1]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup()
- layer.adapt(adapt_data)
- inverse_layer = integer_lookup.IntegerLookup(
- vocabulary=layer.get_vocabulary(), invert=True)
- int_data = layer(input_data)
- inverse_data = inverse_layer(int_data)
- model = keras.Model(inputs=input_data, outputs=inverse_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IntegerLookupVocabularyTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def _write_to_temp_file(self, file_name, vocab_list):
- vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
- with gfile.GFile(vocab_path, "w") as writer:
- for vocab in vocab_list:
- writer.write(str(vocab) + "\n")
- writer.flush()
- writer.close()
- return vocab_path
-
- def test_int_output_explicit_vocab(self):
- vocab_data = [42, 1138, 725, 1729]
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(vocabulary=vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_no_vocab(self):
- with self.assertRaisesRegex(ValueError,
- "You must set the layer's vocabulary"):
- layer = integer_lookup.IntegerLookup()
- layer([[1]])
-
- def test_one_hot_output(self):
- vocab_data = [2, 3, 4, 5]
- input_array = np.array([2, 3, 4, 5, 6])
- expected_output = [
- [0, 1, 0, 0, 0],
- [0, 0, 1, 0, 0],
- [0, 0, 0, 1, 0],
- [0, 0, 0, 0, 1],
- [1, 0, 0, 0, 0],
- ]
-
- input_data = keras.Input(shape=(1,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(
- vocabulary=vocab_data, output_mode="one_hot")
- res = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=res)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_multi_hot_output(self):
- vocab_data = [2, 3, 4, 5]
- input_array = np.array([[2, 2, 3, 4], [0, 1, 5, 2]])
- expected_output = [[0, 1, 1, 1, 0], [1, 1, 0, 0, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(
- vocabulary=vocab_data, output_mode="multi_hot")
- res = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=res)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_count_output(self):
- vocab_data = [2, 3, 4, 5]
- input_array = np.array([[2, 2, 3, 4], [0, 1, 5, 6]])
- expected_output = [[0, 2, 1, 1, 0], [3, 0, 0, 0, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(
- vocabulary=vocab_data, output_mode="count")
- res = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=res)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_sparse_output(self):
- vocab_data = [2, 3, 4, 5]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(
- vocabulary=vocab_data, output_mode="multi_hot", sparse=True)
- res = layer(input_data)
-    self.assertEqual(res.__class__.__name__, "SparseKerasTensor")
-
- def test_get_vocab_returns_int(self):
- vocab_data = [42, 1138, 725, 1729]
- expected_vocab = [-1, 42, 1138, 725, 1729]
- layer = integer_lookup.IntegerLookup(vocabulary=vocab_data)
- layer_vocab = layer.get_vocabulary()
- self.assertAllEqual(expected_vocab, layer_vocab)
- self.assertIsInstance(layer_vocab[0], np.int64)
-
- def test_int_output_explicit_vocab_from_file(self):
- vocab_list = [42, 1138, 725, 1729]
- vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
-
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(vocabulary=vocab_path)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_inverted_vocab_from_file(self):
- vocab_list = [42, 1138, 725, 1729]
- vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
-
- input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]])
- expected_output = [[42, 1138, 725, 1729], [1729, 725, 42, -1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(vocabulary=vocab_path, invert=True)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_inverted_vocab_from_file_with_mask(self):
- vocab_list = [42, 1138, 725, 1729]
- vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
-
- input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]])
- expected_output = [[42, 1138, 725, 1729], [1729, 725, 42, -10]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(
- vocabulary=vocab_path, invert=True, mask_value=-10)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_explicit_vocab_from_file_via_setter(self):
- vocab_list = [42, 1138, 725, 1729]
- vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
-
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup()
- layer.set_vocabulary(vocab_path)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_non_unique_vocab_fails(self):
- vocab_data = [42, 1138, 725, 1729, 1729]
- with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"):
- _ = integer_lookup.IntegerLookup(vocabulary=vocab_data)
-
- def test_non_unique_vocab_from_file_fails(self):
- vocab_list = [42, 1138, 725, 1729, 42]
- vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
- with self.assertRaisesRegex(
- errors_impl.FailedPreconditionError,
- ".*HashTable has different value for same key.*42.*"):
- _ = integer_lookup.IntegerLookup(vocabulary=vocab_path)
-
- def test_tensor_vocab(self):
- vocab_data = [-1, 42, 1138, 725, 1729]
- vocab_tensor = constant_op.constant(vocab_data, dtypes.int64)
- layer = integer_lookup.IntegerLookup(vocabulary=vocab_tensor)
- returned_vocab = layer.get_vocabulary()
- self.assertAllEqual(vocab_data, returned_vocab)
- self.assertAllEqual(layer.vocabulary_size(), 5)
- fn = def_function.function(lambda: layer.set_vocabulary(vocab_tensor))
- with self.assertRaisesRegex(RuntimeError, "Cannot set a tensor vocabulary"):
- fn()
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IntegerLookupErrorTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_too_long_vocab_fails_in_single_setting(self):
- vocab_data = [42, 1138, 725, 1729]
-
- layer = integer_lookup.IntegerLookup(max_tokens=4, num_oov_indices=1)
- with self.assertRaisesRegex(ValueError,
- "vocabulary larger than the maximum vocab.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_zero_max_tokens_fails(self):
- with self.assertRaisesRegex(ValueError, ".*max_tokens.*"):
- _ = integer_lookup.IntegerLookup(max_tokens=0, num_oov_indices=1)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IntegerLookupSavingTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def tearDown(self):
- keras.backend.clear_session()
- gc.collect()
- super(IntegerLookupSavingTest, self).tearDown()
-
- def test_vocabulary_persistence_across_saving(self):
- vocab_data = [42, 1138, 725, 1729]
- input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = integer_lookup.IntegerLookup(max_tokens=None, num_oov_indices=1)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- model.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
-
- loaded_model = keras.models.load_model(
- output_path,
- custom_objects={"IntegerLookup": integer_lookup.IntegerLookup})
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = loaded_model.predict(input_array)
- self.assertAllEqual(new_output_dataset, expected_output)
-
-
-if __name__ == "__main__":
- test.main()
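The tests above pin down the removed layer's indexing behavior: with the default single OOV bucket, vocabulary terms map to indices 1..N in vocabulary order, unseen values (such as 203) map to index 0, and invert=True reverses the mapping (OOV comes back as -1). A minimal sketch of that behavior, assuming a TF release where the maintained tf.keras.layers.IntegerLookup is available (older releases expose it under tf.keras.layers.experimental.preprocessing):

import numpy as np
import tensorflow as tf

vocab = [42, 1138, 725, 1729]
# One OOV index by default, so known terms start at index 1.
lookup = tf.keras.layers.IntegerLookup(vocabulary=vocab)
inputs = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]])
print(lookup(inputs).numpy())            # [[1 2 3 4] [4 3 1 0]] -- 203 is OOV -> 0

inverse = tf.keras.layers.IntegerLookup(vocabulary=vocab, invert=True)
print(inverse(lookup(inputs)).numpy())   # round trip; the OOV value returns as -1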
diff --git a/tensorflow/python/keras/layers/preprocessing/normalization.py b/tensorflow/python/keras/layers/preprocessing/normalization.py
deleted file mode 100644
index a83742f..0000000
--- a/tensorflow/python/keras/layers/preprocessing/normalization.py
+++ /dev/null
@@ -1,302 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Normalization preprocessing layer."""
-# pylint: disable=g-classes-have-attributes
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras import backend
-from tensorflow.python.keras.engine import base_preprocessing_layer
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_impl
-from tensorflow.python.ops import variables
-from tensorflow.python.util.tf_export import keras_export
-
-
-@keras_export('keras.layers.experimental.preprocessing.Normalization')
-class Normalization(base_preprocessing_layer.PreprocessingLayer):
- """Feature-wise normalization of the data.
-
- This layer will coerce its inputs into a distribution centered around
- 0 with standard deviation 1. It accomplishes this by precomputing the mean and
- variance of the data, and calling (input-mean)/sqrt(var) at runtime.
-
- What happens in `adapt`: Compute mean and variance of the data and store them
- as the layer's weights. `adapt` should be called before `fit`, `evaluate`,
- or `predict`.
-
- Args:
-    axis: Integer or tuple of integers, the axis or axes that should be
-      "kept". These axes are not summed over when calculating the
-      normalization statistics. By default the last axis, the `features`
-      axis, is kept and any `space` or `time` axes are summed. Each element
-      in the axes that are kept is normalized independently. If `axis` is set
-      to 'None', the layer will perform scalar normalization (dividing the
-      input by a single scalar value). The `batch` axis, 0, is always summed
-      over (`axis=0` is not allowed).
- mean: The mean value(s) to use during normalization. The passed value(s)
- will be broadcast to the shape of the kept axes above; if the value(s)
- cannot be broadcast, an error will be raised when this layer's build()
- method is called.
- variance: The variance value(s) to use during normalization. The passed
- value(s) will be broadcast to the shape of the kept axes above; if the
- value(s) cannot be broadcast, an error will be raised when this layer's
- build() method is called.
-
- Examples:
-
- Calculate the mean and variance by analyzing the dataset in `adapt`.
-
- >>> adapt_data = np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32)
- >>> input_data = np.array([[1.], [2.], [3.]], np.float32)
- >>> layer = Normalization()
- >>> layer.adapt(adapt_data)
- >>> layer(input_data)
- <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
- array([[-1.4142135 ],
- [-0.70710677],
- [ 0. ]], dtype=float32)>
-
- Pass the mean and variance directly.
-
- >>> input_data = np.array([[1.], [2.], [3.]], np.float32)
- >>> layer = Normalization(mean=3., variance=2.)
- >>> layer(input_data)
- <tf.Tensor: shape=(3, 1), dtype=float32, numpy=
- array([[-1.4142135 ],
- [-0.70710677],
- [ 0. ]], dtype=float32)>
- """
-
- def __init__(self, axis=-1, mean=None, variance=None, **kwargs):
- super().__init__(streaming=True, **kwargs)
-
- # Standardize `axis` to a tuple.
- if axis is None:
- axis = ()
- elif isinstance(axis, int):
- axis = (axis,)
- else:
- axis = tuple(axis)
- if 0 in axis:
- raise ValueError('The argument \'axis\' may not be 0.')
- self.axis = axis
-
- # Set `mean` and `variance` if passed.
- if isinstance(mean, variables.Variable):
- raise ValueError('Normalization does not support passing a Variable '
- 'for the `mean` init arg.')
- if isinstance(variance, variables.Variable):
- raise ValueError('Normalization does not support passing a Variable '
- 'for the `variance` init arg.')
- if (mean is not None) != (variance is not None):
- raise ValueError(
- 'When setting values directly, both `mean` and `variance` '
- 'must be set. Got mean: {} and variance: {}'.format(mean, variance))
- self.input_mean = mean
- self.input_variance = variance
-
- def build(self, input_shape):
- super().build(input_shape)
-
- input_shape = tensor_shape.TensorShape(input_shape).as_list()
- if len(input_shape) == 1:
- input_shape = input_shape + [1]
- ndim = len(input_shape)
-
- if any(a < 1 - ndim or a >= ndim for a in self.axis):
- raise ValueError('All `axis` values must be in the range '
- '[1 - ndim, ndim - 1]. Found '
- 'ndim: `{}`, axis: {}'.format(ndim, self.axis))
-
- # Axes to be kept, replacing negative values with positive equivalents.
- # Sorted to avoid transposing axes.
- self._keep_axis = sorted([d if d >= 0 else d + ndim for d in self.axis])
- # Axes to be reduced.
- self._reduce_axis = [d for d in range(ndim) if d not in self._keep_axis]
- # 1 if an axis should be reduced, 0 otherwise.
- self._reduce_axis_mask = [
- 0 if d in self._keep_axis else 1 for d in range(ndim)
- ]
- # Broadcast any reduced axes.
- self._broadcast_shape = [
- input_shape[d] if d in self._keep_axis else 1 for d in range(ndim)
- ]
- mean_and_var_shape = tuple(input_shape[d] for d in self._keep_axis)
-
- if self.input_mean is None:
- self.adapt_mean = self.add_weight(
- name='mean',
- shape=mean_and_var_shape,
- dtype=self.dtype,
- initializer=init_ops.zeros_initializer,
- trainable=False)
- self.adapt_variance = self.add_weight(
- name='variance',
- shape=mean_and_var_shape,
- dtype=self.dtype,
- initializer=init_ops.ones_initializer,
- trainable=False)
- self.count = self.add_weight(
- name='count',
- shape=(),
- dtype=dtypes.int64,
- initializer=init_ops.zeros_initializer,
- trainable=False)
- self.finalize_state()
- else:
- # In the no adapt case, make constant tensors for mean and variance with
- # proper broadcast shape for use during call.
- mean = self.input_mean * np.ones(mean_and_var_shape)
- variance = self.input_variance * np.ones(mean_and_var_shape)
- mean = array_ops.reshape(mean, self._broadcast_shape)
- variance = array_ops.reshape(variance, self._broadcast_shape)
- self.mean = math_ops.cast(mean, self.compute_dtype)
- self.variance = math_ops.cast(variance, self.compute_dtype)
-
- def update_state(self, data):
- if self.input_mean is not None:
- raise ValueError(
-          'Cannot `adapt` a Normalization layer that is initialized with '
-          'static `mean` and `variance`; you passed mean {} and variance {}.'
- .format(self.input_mean, self.input_variance))
-
- if not self.built:
- raise RuntimeError('`build` must be called before `update_state`.')
-
- data = self._standardize_inputs(data)
- data = math_ops.cast(data, self.adapt_mean.dtype)
- batch_mean, batch_variance = nn_impl.moments_v2(
- data, axes=self._reduce_axis)
- batch_shape = array_ops.shape(data, out_type=self.count.dtype)
- batch_reduce_shape = array_ops.gather(batch_shape, self._reduce_axis)
- batch_count = math_ops.reduce_prod(batch_reduce_shape)
-
- total_count = batch_count + self.count
- batch_weight = (
- math_ops.cast(batch_count, dtype=self.dtype) /
- math_ops.cast(total_count, dtype=self.dtype))
- existing_weight = 1. - batch_weight
-
- total_mean = self.adapt_mean * existing_weight + batch_mean * batch_weight
- # The variance is computed using the lack-of-fit sum of squares
- # formula (see https://en.wikipedia.org/wiki/Lack-of-fit_sum_of_squares).
- total_variance = ((self.adapt_variance +
- (self.adapt_mean - total_mean)**2) * existing_weight +
- (batch_variance +
- (batch_mean - total_mean)**2) * batch_weight)
- self.adapt_mean.assign(total_mean)
- self.adapt_variance.assign(total_variance)
- self.count.assign(total_count)
-
- def merge_state(self, layers):
- layers = layers + [self]
- for l in layers:
- if l.input_mean is not None:
- raise ValueError(
-            'Cannot merge Normalization layer {} that has been initialized '
-            'with `mean` and `variance`; you passed `mean={}` and '
-            '`variance={}`.'
- .format(l.name, l.input_mean, l.input_variance))
- if not l.built:
- raise ValueError(
- 'Cannot merge Normalization layer {}, it has no state. You need to '
- 'call `adapt` on this layer before merging.'.format(l.name))
-
- layer_counts = [l.count for l in layers]
- layer_means = [l.adapt_mean for l in layers]
- layer_variances = [l.adapt_variance for l in layers]
-
- total_count = math_ops.reduce_sum(layer_counts)
- layer_weightings = (
- math_ops.cast(layer_counts, self.dtype) /
- math_ops.cast(total_count, self.dtype))
- layer_weightings = array_ops.reshape(
- layer_weightings,
- shape=[len(layers)] + [1] * self.adapt_mean.shape.rank)
-
- total_mean = math_ops.reduce_sum(layer_means * layer_weightings, axis=0)
- inter_layer_variances = (layer_means - total_mean)**2
- total_variance = math_ops.reduce_sum(
- ((layer_variances + inter_layer_variances) * layer_weightings), axis=0)
-
- self.adapt_mean.assign(total_mean)
- self.adapt_variance.assign(total_variance)
- self.count.assign(total_count)
- self.finalize_state()
-
- def reset_state(self): # pylint: disable=method-hidden
- if self.input_mean is not None or not self.built:
- return
-
- self.adapt_mean.assign(array_ops.zeros_like(self.adapt_mean))
- self.adapt_variance.assign(array_ops.ones_like(self.adapt_variance))
- self.count.assign(array_ops.zeros_like(self.count))
-
- def finalize_state(self):
- if self.input_mean is not None or not self.built:
- return
-
- # In the adapt case, we make constant tensors for mean and variance with
- # proper broadcast shape and dtype each time `finalize_state` is called.
- self.mean = array_ops.reshape(self.adapt_mean, self._broadcast_shape)
- self.mean = math_ops.cast(self.mean, self.compute_dtype)
- self.variance = array_ops.reshape(self.adapt_variance,
- self._broadcast_shape)
- self.variance = math_ops.cast(self.variance, self.compute_dtype)
-
- def call(self, inputs):
- inputs = self._standardize_inputs(inputs)
- # The base layer automatically casts floating-point inputs, but we
- # explicitly cast here to also allow integer inputs to be passed
- inputs = math_ops.cast(inputs, self.compute_dtype)
- return ((inputs - self.mean) /
- math_ops.maximum(math_ops.sqrt(self.variance), backend.epsilon()))
-
- def compute_output_shape(self, input_shape):
- return input_shape
-
- def compute_output_signature(self, input_spec):
- return input_spec
-
- def get_config(self):
- config = super().get_config()
- config.update({
- 'axis': self.axis,
- 'mean': self._convert_to_list(self.input_mean),
- 'variance': self._convert_to_list(self.input_variance),
- })
- return config
-
- def _standardize_inputs(self, inputs):
- inputs = ops.convert_to_tensor_v2_with_dispatch(inputs)
- if inputs.shape.rank == 0:
- inputs = array_ops.reshape(inputs, [1, 1])
- elif inputs.shape.rank == 1:
- inputs = array_ops.expand_dims(inputs, 1)
- return inputs
-
- def _convert_to_list(self, inputs):
- if tensor_util.is_tensor(inputs):
- inputs = inputs.numpy()
- if isinstance(inputs, (np.ndarray)):
- inputs = inputs.tolist()
- inputs = list(inputs)
- return inputs
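The removed layer's update_state keeps running mean and variance weights and folds each batch in with the lack-of-fit sum-of-squares combination noted in the comment above. A minimal NumPy sketch of that combination for scalar statistics (plain Python, no Keras dependencies; names are illustrative only):

import numpy as np

def combine(mean, var, count, batch):
    # Fold one batch of values into the running (population) mean/variance.
    batch_mean, batch_var, batch_count = batch.mean(), batch.var(), batch.size
    total = count + batch_count
    w = batch_count / total
    new_mean = mean * (1 - w) + batch_mean * w
    # Shift each variance by the squared distance of its mean from the
    # combined mean before weighting (lack-of-fit sum of squares).
    new_var = ((var + (mean - new_mean) ** 2) * (1 - w) +
               (batch_var + (batch_mean - new_mean) ** 2) * w)
    return new_mean, new_var, total

data = np.arange(10.0)
mean, var, count = 0.0, 1.0, 0            # same initial state as the layer
for chunk in np.split(data, 5):           # five "adapt" batches of two values
    mean, var, count = combine(mean, var, count, chunk)
assert np.isclose(mean, data.mean()) and np.isclose(var, data.var())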
diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_distribution_test.py
deleted file mode 100644
index 6050e9e6..0000000
--- a/tensorflow/python/keras/layers/preprocessing/normalization_distribution_test.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Distribution tests for keras.layers.preprocessing.normalization."""
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import combinations as ds_combinations
-from tensorflow.python.distribute import multi_process_runner
-from tensorflow.python.framework import test_combinations as combinations
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.distribute import strategy_combinations
-from tensorflow.python.keras.layers.preprocessing import normalization
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-
-
-def _get_layer_computation_test_cases():
- test_cases = ({
- "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32),
- "axis": -1,
- "test_data": np.array([[1.], [2.], [3.]], np.float32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_single_element"
- }, {
- "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32),
- "axis": None,
- "test_data": np.array([[1.], [2.], [3.]], np.float32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_single_element_none_axis"
- }, {
- "adapt_data": np.array([[1., 2., 3., 4., 5.]], dtype=np.float32),
- "axis": None,
- "test_data": np.array([[1.], [2.], [3.]], np.float32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_single_element_none_axis_flat_data"
- }, {
- "adapt_data":
- np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]],
- np.float32),
- "axis":
- 1,
- "test_data":
- np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]],
- np.float32),
- "expected":
- np.array([[[-1.549193, -0.774597, 0.], [-1.549193, -0.774597, 0.]],
- [[0., 0.774597, 1.549193], [0., 0.774597, 1.549193]]],
- np.float32),
- "testcase_name":
- "3d_internal_axis"
- }, {
- "adapt_data":
- np.array(
- [[[1., 0., 3.], [2., 3., 4.]], [[3., -1., 5.], [4., 5., 8.]]],
- np.float32),
- "axis": (1, 2),
- "test_data":
- np.array(
- [[[3., 1., -1.], [2., 5., 4.]], [[3., 0., 5.], [2., 5., 8.]]],
- np.float32),
- "expected":
- np.array(
- [[[1., 3., -5.], [-1., 1., -1.]], [[1., 1., 1.], [-1., 1., 1.]]],
- np.float32),
- "testcase_name":
- "3d_multiple_axis"
- })
-
- crossed_test_cases = []
- # Cross above test cases with use_dataset in (True, False)
- for use_dataset in (True, False):
- for case in test_cases:
- case = case.copy()
- if use_dataset:
- case["testcase_name"] = case["testcase_name"] + "_with_dataset"
- case["use_dataset"] = use_dataset
- crossed_test_cases.append(case)
-
- return crossed_test_cases
-
-
-@ds_combinations.generate(
- combinations.times(
- combinations.combine(
- strategy=strategy_combinations.all_strategies +
- strategy_combinations.multi_worker_mirrored_strategies,
- mode=["eager"]), _get_layer_computation_test_cases()))
-class NormalizationTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_layer_computation(self, strategy, adapt_data, axis, test_data,
- use_dataset, expected):
- input_shape = tuple([None for _ in range(test_data.ndim - 1)])
- if use_dataset:
- # Keras APIs expect batched datasets
- adapt_data = dataset_ops.Dataset.from_tensor_slices(adapt_data).batch(
- test_data.shape[0] // 2)
- test_data = dataset_ops.Dataset.from_tensor_slices(test_data).batch(
- test_data.shape[0] // 2)
-
- with strategy.scope():
- input_data = keras.Input(shape=input_shape)
- layer = normalization.Normalization(axis=axis)
- layer.adapt(adapt_data)
- output = layer(input_data)
- model = keras.Model(input_data, output)
- output_data = model.predict(test_data)
- self.assertAllClose(expected, output_data)
-
-
-if __name__ == "__main__":
- v2_compat.enable_v2_behavior()
- multi_process_runner.test_main()
diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_test.py
deleted file mode 100644
index 2e35d46..0000000
--- a/tensorflow/python/keras/layers/preprocessing/normalization_test.py
+++ /dev/null
@@ -1,408 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for keras.layers.preprocessing.normalization."""
-
-import os
-
-from absl.testing import parameterized
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.layers.preprocessing import normalization
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import load
-from tensorflow.python.saved_model import save
-
-
-def _get_layer_computation_test_cases():
- test_cases = ({
- "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32),
- "axis": -1,
- "test_data": np.array([[1.], [2.], [3.]], np.float32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_single_element"
- }, {
- "adapt_data": np.array([[1], [2], [3], [4], [5]], dtype=np.int32),
- "axis": -1,
- "test_data": np.array([[1], [2], [3]], np.int32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_int_data"
- }, {
- "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32),
- "axis": None,
- "test_data": np.array([[1.], [2.], [3.]], np.float32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_single_element_none_axis"
- }, {
- "adapt_data": np.array([[1., 2., 3., 4., 5.]], dtype=np.float32),
- "axis": None,
- "test_data": np.array([[1.], [2.], [3.]], np.float32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_single_element_none_axis_flat_data"
- }, {
- "adapt_data":
- np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]],
- np.float32),
- "axis":
- 1,
- "test_data":
- np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]],
- np.float32),
- "expected":
- np.array([[[-1.549193, -0.774597, 0.], [-1.549193, -0.774597, 0.]],
- [[0., 0.774597, 1.549193], [0., 0.774597, 1.549193]]],
- np.float32),
- "testcase_name":
- "3d_internal_axis"
- }, {
- "adapt_data":
- np.array(
- [[[1., 0., 3.], [2., 3., 4.]], [[3., -1., 5.], [4., 5., 8.]]],
- np.float32),
- "axis": (1, 2),
- "test_data":
- np.array(
- [[[3., 1., -1.], [2., 5., 4.]], [[3., 0., 5.], [2., 5., 8.]]],
- np.float32),
- "expected":
- np.array(
- [[[1., 3., -5.], [-1., 1., -1.]], [[1., 1., 1.], [-1., 1., 1.]]],
- np.float32),
- "testcase_name":
- "3d_multiple_axis"
- }, {
- "adapt_data":
- np.zeros((3, 4)),
- "axis": -1,
- "test_data":
- np.zeros((3, 4)),
- "expected":
- np.zeros((3, 4)),
- "testcase_name":
- "zero_variance"
- })
-
- crossed_test_cases = []
- # Cross above test cases with use_dataset in (True, False)
- for use_dataset in (True, False):
- for case in test_cases:
- case = case.copy()
- if use_dataset:
- case["testcase_name"] = case["testcase_name"] + "_with_dataset"
- case["use_dataset"] = use_dataset
- crossed_test_cases.append(case)
-
- return crossed_test_cases
-
-
-@keras_parameterized.run_all_keras_modes
-class NormalizationTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_broadcasting_during_direct_setting(self):
- layer = normalization.Normalization(axis=-1, mean=[1.0], variance=[1.0])
- output = layer(np.array([[1., 2.]]))
- expected_output = [[0., 1.]]
- self.assertAllClose(output, expected_output)
- self.assertAllClose(layer.get_weights(), [])
-
- def test_broadcasting_during_direct_setting_with_tensors(self):
- if not context.executing_eagerly():
- self.skipTest("Only supported in TF2.")
-
- layer = normalization.Normalization(
- axis=-1,
- mean=constant_op.constant([1.0]),
- variance=constant_op.constant([1.0]))
- output = layer(np.array([[1., 2.]]))
- expected_output = [[0., 1.]]
- self.assertAllClose(output, expected_output)
- self.assertAllClose(layer.get_weights(), [])
-
- def test_broadcasting_during_direct_setting_with_variables_fails(self):
- with self.assertRaisesRegex(ValueError, "passing a Variable"):
- _ = normalization.Normalization(
- axis=-1,
- mean=variables.Variable([1.0]),
- variance=variables.Variable([2.0]))
-
- @parameterized.parameters(
- {"axis": 0},
- {"axis": (-1, 0)},
- )
- def test_zeros_fail_init(self, axis):
- with self.assertRaisesRegex(ValueError,
- "The argument 'axis' may not be 0."):
- normalization.Normalization(axis=axis)
-
- @parameterized.parameters(
- # Out of bounds
- {"axis": 3},
- {"axis": -3},
- # In a tuple
- {"axis": (1, 3)},
- {"axis": (1, -3)},
- )
- def test_bad_axis_fail_build(self, axis):
- layer = normalization.Normalization(axis=axis)
- with self.assertRaisesRegex(ValueError, r"in the range"):
- layer.build([None, 2, 3])
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class NormalizationAdaptTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_layer_api_compatibility(self):
- cls = normalization.Normalization
- with CustomObjectScope({"Normalization": cls}):
- output_data = testing_utils.layer_test(
- cls,
- kwargs={"axis": -1},
- input_shape=(None, 3),
- input_data=np.array([[3, 1, 2], [6, 5, 4]], dtype=np.float32),
- validate_training=False,
- adapt_data=np.array([[1, 2, 1], [2, 3, 4], [1, 2, 1], [2, 3, 4]]))
- expected = np.array([[3., -3., -0.33333333], [9., 5., 1.]])
- self.assertAllClose(expected, output_data)
-
- @parameterized.named_parameters(*_get_layer_computation_test_cases())
- def test_layer_computation(self, adapt_data, axis, test_data, use_dataset,
- expected):
- input_shape = tuple([test_data.shape[i] for i in range(1, test_data.ndim)])
- if use_dataset:
- # Keras APIs expect batched datasets
- adapt_data = dataset_ops.Dataset.from_tensor_slices(adapt_data).batch(
- test_data.shape[0] // 2)
- test_data = dataset_ops.Dataset.from_tensor_slices(test_data).batch(
- test_data.shape[0] // 2)
-
- layer = normalization.Normalization(axis=axis)
- layer.adapt(adapt_data)
-
- input_data = keras.Input(shape=input_shape)
- output = layer(input_data)
- model = keras.Model(input_data, output)
- model._run_eagerly = testing_utils.should_run_eagerly()
- output_data = model.predict(test_data)
- self.assertAllClose(expected, output_data)
-
- def test_1d_data(self):
- data = [0, 2, 0, 2]
- layer = normalization.Normalization(axis=-1)
- layer.adapt(data)
- output = layer(data)
- self.assertListEqual(output.shape.as_list(), [4, 1])
- if context.executing_eagerly():
- self.assertAllClose(output.numpy(), [[-1], [1], [-1], [1]])
-
- def test_0d_data(self):
- if not context.executing_eagerly():
- self.skipTest("Only supported in TF2.")
-
- data = [0, 2, 0, 2]
- layer = normalization.Normalization(axis=-1)
- layer.adapt(data)
- output = layer(0.)
- self.assertListEqual(output.shape.as_list(), [1, 1])
- self.assertAllClose(output.numpy(), [[-1]])
-
- @parameterized.parameters(
- # Results should be identical no matter how the axes are specified (3d).
- {"axis": (1, 2)},
- {"axis": (2, 1)},
- {"axis": (1, -1)},
- {"axis": (-1, 1)},
- )
- def test_axis_permutations(self, axis):
- layer = normalization.Normalization(axis=axis)
- # data.shape = [2, 2, 3]
- data = np.array([[[0., 1., 2.], [0., 2., 6.]],
- [[2., 3., 4.], [3., 6., 10.]]])
- expect = np.array([[[-1., -1., -1.], [-1., -1., -1.]],
- [[1., 1., 1.], [1., 1., 1.]]])
- layer.adapt(data)
- self.assertAllClose(expect, layer(data))
-
- def test_model_summary_after_layer_adapt(self):
- data = np.array([[[0., 1., 2.], [0., 2., 6.]],
- [[2., 3., 4.], [3., 6., 10.]]])
- layer = normalization.Normalization(axis=-1)
- layer.adapt(data)
- model = keras.Sequential(
- [layer,
- keras.layers.Dense(64, activation="relu"),
- keras.layers.Dense(1)])
- model.summary()
-
- def test_merge_state(self):
- data = np.random.rand(30, 10, 2)
- ds = dataset_ops.Dataset.from_tensor_slices(data).batch(2)
- norm = normalization.Normalization(axis=(1, 2))
- norm.adapt(ds)
-
- partial_ds_1 = ds.shard(3, 0)
- partial_ds_2 = ds.shard(3, 1)
- partial_ds_3 = ds.shard(3, 2)
-
- norm_1 = normalization.Normalization(axis=(1, 2))
- norm_2 = normalization.Normalization(axis=(1, 2))
- norm_3 = normalization.Normalization(axis=(1, 2))
-
- norm_1.adapt(partial_ds_1)
- norm_2.adapt(partial_ds_2)
- norm_3.adapt(partial_ds_3)
-
- norm_1.merge_state([norm_2, norm_3])
- merged_norm = norm_1
-
- self.assertAllClose(norm(data), merged_norm(data))
-
- def test_multiple_adapts(self):
- first_adapt = [[0], [2], [0], [2]]
- second_adapt = [[2], [4], [2], [4]]
- predict_input = [[2], [2]]
- expected_first_output = [[1], [1]]
- expected_second_output = [[-1], [-1]]
-
- inputs = keras.Input(shape=(1,), dtype=dtypes.int32)
- layer = normalization.Normalization(axis=-1)
- layer.adapt(first_adapt)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
-
- actual_output = model.predict(predict_input)
- self.assertAllClose(actual_output, expected_first_output)
-
- # Re-adapt the layer on new inputs.
- layer.adapt(second_adapt)
- # Re-compile the model.
- model.compile()
- # `predict` should now use the new model state.
- actual_output = model.predict(predict_input)
- self.assertAllClose(actual_output, expected_second_output)
-
- @parameterized.parameters(
- {"adapted": True},
- {"adapted": False},
- )
- def test_saved_model_tf(self, adapted):
- input_data = [[0.], [2.], [0.], [2.]]
- expected_output = [[-1.], [1.], [-1.], [1.]]
-
- inputs = keras.Input(shape=(1,), dtype=dtypes.float32)
- if adapted:
- layer = normalization.Normalization(axis=-1)
- layer.adapt(input_data)
- else:
- layer = normalization.Normalization(mean=1., variance=1.)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
-
- output_data = model.predict(input_data)
- self.assertAllClose(output_data, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_saved_model")
- save.save(model, output_path)
- loaded_model = load.load(output_path)
- f = loaded_model.signatures["serving_default"]
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_data = f(constant_op.constant(input_data))["normalization"]
- self.assertAllClose(new_output_data, expected_output)
-
- @parameterized.parameters(
- {"adapted": True},
- {"adapted": False},
- )
- def test_saved_model_keras(self, adapted):
- input_data = [[0.], [2.], [0.], [2.]]
- expected_output = [[-1.], [1.], [-1.], [1.]]
-
- cls = normalization.Normalization
- inputs = keras.Input(shape=(1,), dtype=dtypes.float32)
- if adapted:
- layer = cls(axis=-1)
- layer.adapt(input_data)
- else:
- layer = cls(mean=1., variance=1.)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
-
- output_data = model.predict(input_data)
- self.assertAllClose(output_data, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- model.save(output_path, save_format="tf")
- loaded_model = keras.models.load_model(
- output_path, custom_objects={"Normalization": cls})
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_data = loaded_model.predict(input_data)
- self.assertAllClose(new_output_data, expected_output)
-
- @parameterized.parameters(
- {"adapted": True},
- {"adapted": False},
- )
- def test_saved_weights_keras(self, adapted):
- input_data = [[0.], [2.], [0.], [2.]]
- expected_output = [[-1.], [1.], [-1.], [1.]]
-
- cls = normalization.Normalization
- inputs = keras.Input(shape=(1,), dtype=dtypes.float32)
- if adapted:
- layer = cls(axis=-1)
- layer.adapt(input_data)
- else:
- layer = cls(mean=1., variance=1.)
- outputs = layer(inputs)
- model = keras.Model(inputs=inputs, outputs=outputs)
-
- output_data = model.predict(input_data)
- self.assertAllClose(output_data, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_weights")
- model.save_weights(output_path, save_format="tf")
- new_model = keras.Model.from_config(
- model.get_config(), custom_objects={"Normalization": cls})
- new_model.load_weights(output_path)
-
- # Validate correctness of the new model.
- new_output_data = new_model.predict(input_data)
- self.assertAllClose(new_output_data, expected_output)
-
-
-if __name__ == "__main__":
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_tpu_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_tpu_test.py
deleted file mode 100644
index 50684fe..0000000
--- a/tensorflow/python/keras/layers/preprocessing/normalization_tpu_test.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for keras.layers.preprocessing.normalization."""
-
-from absl.testing import parameterized
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.distribute import tpu_strategy_test_utils
-from tensorflow.python.keras.layers.preprocessing import normalization
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.platform import test
-
-
-def _get_layer_computation_test_cases():
- test_cases = ({
- "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32),
- "axis": -1,
- "test_data": np.array([[1.], [2.], [3.]], np.float32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_single_element"
- }, {
- "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32),
- "axis": None,
- "test_data": np.array([[1.], [2.], [3.]], np.float32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_single_element_none_axis"
- }, {
- "adapt_data": np.array([[1., 2., 3., 4., 5.]], dtype=np.float32),
- "axis": None,
- "test_data": np.array([[1.], [2.], [3.]], np.float32),
- "expected": np.array([[-1.414214], [-.707107], [0]], np.float32),
- "testcase_name": "2d_single_element_none_axis_flat_data"
- }, {
- "adapt_data":
- np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]],
- np.float32),
- "axis":
- 1,
- "test_data":
- np.array([[[1., 2., 3.], [2., 3., 4.]], [[3., 4., 5.], [4., 5., 6.]]],
- np.float32),
- "expected":
- np.array([[[-1.549193, -0.774597, 0.], [-1.549193, -0.774597, 0.]],
- [[0., 0.774597, 1.549193], [0., 0.774597, 1.549193]]],
- np.float32),
- "testcase_name":
- "3d_internal_axis"
- }, {
- "adapt_data":
- np.array(
- [[[1., 0., 3.], [2., 3., 4.]], [[3., -1., 5.], [4., 5., 8.]]],
- np.float32),
- "axis": (1, 2),
- "test_data":
- np.array(
- [[[3., 1., -1.], [2., 5., 4.]], [[3., 0., 5.], [2., 5., 8.]]],
- np.float32),
- "expected":
- np.array(
- [[[1., 3., -5.], [-1., 1., -1.]], [[1., 1., 1.], [-1., 1., 1.]]],
- np.float32),
- "testcase_name":
- "3d_multiple_axis"
- })
-
- crossed_test_cases = []
- # Cross above test cases with use_dataset in (True, False)
- for use_dataset in (True, False):
- for case in test_cases:
- case = case.copy()
- if use_dataset:
- case["testcase_name"] = case["testcase_name"] + "_with_dataset"
- case["use_dataset"] = use_dataset
- crossed_test_cases.append(case)
-
- return crossed_test_cases
-
-
-@keras_parameterized.run_all_keras_modes(
- always_skip_v1=True, always_skip_eager=True)
-class NormalizationTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- @parameterized.named_parameters(*_get_layer_computation_test_cases())
- def test_layer_computation(self, adapt_data, axis, test_data, use_dataset,
- expected):
- input_shape = tuple([None for _ in range(test_data.ndim - 1)])
- if use_dataset:
- # Keras APIs expect batched datasets
- adapt_data = dataset_ops.Dataset.from_tensor_slices(adapt_data).batch(
- test_data.shape[0] // 2)
- test_data = dataset_ops.Dataset.from_tensor_slices(test_data).batch(
- test_data.shape[0] // 2)
-
- strategy = tpu_strategy_test_utils.get_tpu_strategy()
-
- with strategy.scope():
- input_data = keras.Input(shape=input_shape)
- layer = normalization.Normalization(axis=axis)
- layer.adapt(adapt_data)
- output = layer(input_data)
- model = keras.Model(input_data, output)
- output_data = model.predict(test_data)
- self.assertAllClose(expected, output_data)
-
-
-if __name__ == "__main__":
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/preprocessing_stage.py b/tensorflow/python/keras/layers/preprocessing/preprocessing_stage.py
deleted file mode 100644
index 525d5b4..0000000
--- a/tensorflow/python/keras/layers/preprocessing/preprocessing_stage.py
+++ /dev/null
@@ -1,270 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Preprocessing stage."""
-# pylint: disable=g-classes-have-attributes
-
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.keras.engine import base_preprocessing_layer
-from tensorflow.python.keras.engine import functional
-from tensorflow.python.keras.engine import sequential
-from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.util import nest
-
-
-# Sequential methods should take precedence.
-class PreprocessingStage(sequential.Sequential,
- base_preprocessing_layer.PreprocessingLayer):
- """A sequential preprocessing stage.
-
- This preprocessing stage wraps a list of preprocessing layers into a
- Sequential-like object that enables you to `adapt()` the whole list via
- a single `adapt()` call on the preprocessing stage.
-
- Args:
- layers: List of layers. Can include layers that aren't preprocessing layers.
- name: String. Optional name for the preprocessing stage object.
- """
-
- def adapt(self, data, reset_state=True):
- """Adapt the state of the layers of the preprocessing stage to the data.
-
- Args:
- data: A batched Dataset object, or a NumPy array, or an EagerTensor.
- Data to be iterated over to adapt the state of the layers in this
- preprocessing stage.
- reset_state: Whether this call to `adapt` should reset the state of
- the layers in this preprocessing stage.
- """
- if not isinstance(data,
- (dataset_ops.DatasetV2, np.ndarray, ops.EagerTensor)):
- raise ValueError(
- '`adapt()` requires a batched Dataset, an EagerTensor, '
- 'or a Numpy array as input, '
- 'got {}'.format(type(data)))
- if isinstance(data, dataset_ops.DatasetV2):
- # Validate the datasets to try and ensure we haven't been passed one with
- # infinite size. That would cause an infinite loop here.
- if tf_utils.dataset_is_infinite(data):
- raise ValueError(
- 'The dataset passed to `adapt()` has an infinite number of '
- 'elements. Please use dataset.take(...) to make the number '
- 'of elements finite.')
-
- for current_layer_index in range(0, len(self.layers)):
- if not hasattr(self.layers[current_layer_index], 'adapt'):
- # Skip any layer that does not need adapting.
- continue
-
- def map_fn(x):
- """Maps `PreprocessingStage` inputs to inputs at `current_layer_index`.
-
- Args:
- x: Batch of inputs seen in entry of the `PreprocessingStage` instance.
-
- Returns:
- Batch of inputs to be processed by layer
- `self.layers[current_layer_index]`
- """
- if current_layer_index == 0: # pylint: disable=cell-var-from-loop
- return x
- for i in range(current_layer_index): # pylint: disable=cell-var-from-loop
- x = self.layers[i](x)
- return x
-
- if isinstance(data, dataset_ops.DatasetV2):
- current_layer_data = data.map(map_fn)
- else:
- current_layer_data = map_fn(data)
- self.layers[current_layer_index].adapt(current_layer_data,
- reset_state=reset_state)
-
-
-# Functional methods should take precedence.
-class FunctionalPreprocessingStage(functional.Functional,
- base_preprocessing_layer.PreprocessingLayer):
- """A functional preprocessing stage.
-
- This preprocessing stage wraps a graph of preprocessing layers into a
- Functional-like object that enables you to `adapt()` the whole graph via
- a single `adapt()` call on the preprocessing stage.
-
-  A preprocessing stage is not a complete model, so `fit()` cannot be called
-  on it. However, it is possible to add regular layers that may be trainable
-  to a preprocessing stage.
-
- A functional preprocessing stage is created in the same way as `Functional`
- models. A stage can be instantiated by passing two arguments to
- `__init__`. The first argument is the `keras.Input` Tensors that represent
- the inputs to the stage. The second argument specifies the output
- tensors that represent the outputs of this stage. Both arguments can be a
- nested structure of tensors.
-
- Example:
-
- >>> inputs = {'x2': tf.keras.Input(shape=(5,)),
- ... 'x1': tf.keras.Input(shape=(1,))}
- >>> norm_layer = tf.keras.layers.experimental.preprocessing.Normalization()
- >>> y = norm_layer(inputs['x2'])
- >>> y, z = tf.keras.layers.Lambda(lambda x: (x, x))(inputs['x1'])
- >>> outputs = [inputs['x1'], [y, z]]
- >>> stage = FunctionalPreprocessingStage(inputs, outputs)
-
- Args:
-    inputs: An input tensor (must be created via `tf.keras.Input()`), or a
-      list, a dict, or a nested structure of input tensors.
- outputs: An output tensor, or a list, a dict or a nested structure of output
- tensors.
- name: String, optional. Name of the preprocessing stage.
- """
-
- def fit(self, *args, **kwargs):
- raise ValueError(
- 'Preprocessing stage is not a complete model, and hence should not be '
- '`fit`. Instead, you may feed data to `adapt` the stage to set '
- 'appropriate states of the layers in the stage.')
-
- def adapt(self, data, reset_state=True):
- """Adapt the state of the layers of the preprocessing stage to the data.
-
- Args:
-      data: A batched Dataset object, a NumPy array, an EagerTensor, or a
-        list, dict, or nested structure of NumPy arrays or EagerTensors. The
-        elements of a Dataset object need to conform with the inputs of the
-        stage. The first dimension of NumPy arrays or EagerTensors is
-        understood to be the batch dimension. Data to be iterated over to
-        adapt the state of the layers in this preprocessing stage.
- reset_state: Whether this call to `adapt` should reset the state of the
- layers in this preprocessing stage.
-
- Examples:
-
- >>> # For a stage with dict input
- >>> inputs = {'x2': tf.keras.Input(shape=(5,)),
- ... 'x1': tf.keras.Input(shape=(1,))}
- >>> outputs = [inputs['x1'], inputs['x2']]
- >>> stage = FunctionalPreprocessingStage(inputs, outputs)
- >>> ds = tf.data.Dataset.from_tensor_slices({'x1': tf.ones((4,5)),
- ... 'x2': tf.ones((4,1))})
- >>> sorted(ds.element_spec.items()) # Check element_spec
- [('x1', TensorSpec(shape=(5,), dtype=tf.float32, name=None)),
- ('x2', TensorSpec(shape=(1,), dtype=tf.float32, name=None))]
- >>> stage.adapt(ds)
- >>> data_np = {'x1': np.ones((4, 5)), 'x2': np.ones((4, 1))}
- >>> stage.adapt(data_np)
-
- """
- if not isinstance(data, dataset_ops.Dataset):
- data = self._flatten_to_reference_inputs(data)
- if any(not isinstance(datum, (np.ndarray, ops.EagerTensor))
- for datum in data):
- raise ValueError(
- '`adapt()` requires a batched Dataset, a list of EagerTensors '
- 'or Numpy arrays as input, got {}'.format(type(data)))
- ds_input = [
- dataset_ops.Dataset.from_tensor_slices(x).batch(1) for x in data
- ]
-
- if isinstance(data, dataset_ops.Dataset):
- # Validate the datasets to try and ensure we haven't been passed one with
- # infinite size. That would cause an infinite loop here.
- if tf_utils.dataset_is_infinite(data):
- raise ValueError(
- 'The dataset passed to `adapt()` has an infinite number of '
- 'elements. Please use dataset.take(...) to make the number '
- 'of elements finite.')
- # Unzip dataset object to a list of single input dataset.
- ds_input = _unzip_dataset(data)
-
- # Dictionary mapping reference tensors to datasets
- ds_dict = {}
- tensor_usage_count = self._tensor_usage_count
- for x, y in zip(self.inputs, ds_input):
- x_id = str(id(x))
- ds_dict[x_id] = [y] * tensor_usage_count[x_id]
-
- nodes_by_depth = self._nodes_by_depth
- depth_keys = sorted(nodes_by_depth.keys(), reverse=True)
-
- def build_map_fn(node, args, kwargs):
- if not isinstance(args.element_spec, tuple):
-
- def map_fn(*x):
- return nest.flatten(node.layer(*x, **kwargs))
- else:
-
- def map_fn(*x):
- return nest.flatten(node.layer(x, **kwargs))
-
- return map_fn
-
- for depth in depth_keys:
- for node in nodes_by_depth[depth]:
- # Input node
- if node.is_input:
- continue
-
- # Node with input not computed yet
- if any(t_id not in ds_dict for t_id in node.flat_input_ids):
- continue
-
- args, kwargs = node.map_arguments(ds_dict)
- args = dataset_ops.Dataset.zip(nest.list_to_tuple(*args))
-
- if node.layer.stateful and hasattr(node.layer, 'adapt'):
- node.layer.adapt(args, reset_state=reset_state)
-
- map_fn = build_map_fn(node, args, kwargs)
- outputs = args.map(map_fn)
- outputs = _unzip_dataset(outputs)
-
- # Update ds_dict.
- for x_id, y in zip(node.flat_output_ids, outputs):
- ds_dict[x_id] = [y] * tensor_usage_count[x_id]
-
-
-def _unzip_dataset(ds):
- """Unzip dataset into a list of single element datasets.
-
- Args:
- ds: A Dataset object.
-
- Returns:
-    A list of Dataset objects, each corresponding to one element of the
-    `element_spec` of the input Dataset object.
-
- Example:
-
- >>> ds1 = tf.data.Dataset.from_tensor_slices([1, 2, 3])
- >>> ds2 = tf.data.Dataset.from_tensor_slices([4, 5, 6])
- >>> ds_zipped_tuple = tf.data.Dataset.zip((ds1, ds2))
- >>> ds_unzipped_tuple = _unzip_dataset(ds_zipped_tuple)
- >>> ds_zipped_dict = tf.data.Dataset.zip({'ds1': ds1, 'ds2': ds2})
- >>> ds_unzipped_dict = _unzip_dataset(ds_zipped_dict)
-
-  The two elements of both `ds_unzipped_tuple` and `ds_unzipped_dict` are then
-  the same as `ds1` and `ds2`.
- """
- element_count = len(nest.flatten(ds.element_spec))
- ds_unzipped = []
- for i in range(element_count):
-
- def map_fn(*x, j=i):
- return nest.flatten(x)[j]
-
- ds_unzipped.append(ds.map(map_fn))
- return ds_unzipped
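Note: the removed `_unzip_dataset` helper above can be reproduced with public
tf.data APIs. A minimal hedged sketch (the `unzip_dataset` name and the printed
check are illustrative, not part of TensorFlow):

import tensorflow as tf

def unzip_dataset(ds):
  """Splits a dataset of tuples/dicts into one dataset per flattened element."""
  num_elements = len(tf.nest.flatten(ds.element_spec))
  # Bind the index as a default argument so each map function keeps its own i.
  return [ds.map(lambda *x, i=i: tf.nest.flatten(x)[i])
          for i in range(num_elements)]

ds1 = tf.data.Dataset.from_tensor_slices([1, 2, 3])
ds2 = tf.data.Dataset.from_tensor_slices([4, 5, 6])
zipped = tf.data.Dataset.zip((ds1, ds2))
unzipped = unzip_dataset(zipped)
print([list(d.as_numpy_iterator()) for d in unzipped])  # [[1, 2, 3], [4, 5, 6]]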
diff --git a/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_functional_test.py b/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_functional_test.py
deleted file mode 100644
index cfff5d0..0000000
--- a/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_functional_test.py
+++ /dev/null
@@ -1,442 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functional preprocessing stage tests."""
-# pylint: disable=g-classes-have-attributes
-
-import time
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.engine import base_preprocessing_layer
-from tensorflow.python.keras.engine.input_layer import Input
-from tensorflow.python.keras.layers import convolutional
-from tensorflow.python.keras.layers import core
-from tensorflow.python.keras.layers import merge
-from tensorflow.python.keras.layers.preprocessing import image_preprocessing
-from tensorflow.python.keras.layers.preprocessing import normalization
-from tensorflow.python.keras.layers.preprocessing import preprocessing_stage
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class PL(base_preprocessing_layer.PreprocessingLayer):
-
- def __init__(self, **kwargs):
- self.adapt_time = None
- self.adapt_count = 0
- super(PL, self).__init__(**kwargs)
-
- def adapt(self, data, reset_state=True):
- self.adapt_time = time.time()
- self.adapt_count += 1
-
- def call(self, inputs):
- return inputs + 1
-
-
-class PLMerge(PL):
-
- def call(self, inputs):
- return inputs[0] + inputs[1]
-
-
-class PLSplit(PL):
-
- def call(self, inputs):
- return inputs + 1, inputs - 1
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class PreprocessingStageTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_adapt_preprocessing_stage_with_single_input_output(self):
-
- x = Input(shape=(3,))
-
- l0 = PL()
- y = l0(x)
-
- l1 = PL()
- z = l1(y)
-
- stage = preprocessing_stage.FunctionalPreprocessingStage(x, z)
- stage.compile()
-
- # Test with NumPy array
- one_array = np.ones((4, 3), dtype='float32')
- stage.adapt(one_array)
- self.assertEqual(l0.adapt_count, 1)
- self.assertEqual(l1.adapt_count, 1)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
-
- # Check call
- z = stage(array_ops.ones((4, 3), dtype='float32'))
- self.assertAllClose(z, np.ones((4, 3), dtype='float32') + 2.)
-
- # Test with dataset
- adapt_data = dataset_ops.Dataset.from_tensor_slices(one_array)
-    adapt_data = adapt_data.batch(2)  # 2 batches of 2 samples
-
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 2)
- self.assertEqual(l1.adapt_count, 2)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
-
- # Test error with bad data
- with self.assertRaisesRegex(ValueError, 'requires a '):
- stage.adapt(None)
-
- # Disallow calling fit
- with self.assertRaisesRegex(ValueError, 'Preprocessing stage'):
- stage.fit(None)
-
- def test_adapt_preprocessing_stage_with_list_input(self):
-
- x0 = Input(shape=(3,))
- x1 = Input(shape=(3,))
- x2 = Input(shape=(3,))
-
- l0 = PLMerge()
- y = l0([x0, x1])
-
- l1 = PLMerge()
- y = l1([y, x2])
-
- l2 = PLSplit()
- z, y = l2(y)
-
- stage = preprocessing_stage.FunctionalPreprocessingStage([x0, x1, x2],
- [y, z])
- stage.compile()
-
- # Test with NumPy array
- one_array = np.ones((4, 3), dtype='float32')
- stage.adapt([one_array, one_array, one_array])
- self.assertEqual(l0.adapt_count, 1)
- self.assertEqual(l1.adapt_count, 1)
- self.assertEqual(l2.adapt_count, 1)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
- self.assertLessEqual(l1.adapt_time, l2.adapt_time)
-
- # Check call
- y, z = stage([
- array_ops.ones((4, 3), dtype='float32'),
- array_ops.ones((4, 3), dtype='float32'),
- array_ops.ones((4, 3), dtype='float32')
- ])
- self.assertAllClose(y, np.ones((4, 3), dtype='float32') + 1.)
- self.assertAllClose(z, np.ones((4, 3), dtype='float32') + 3.)
-
- # Test with dataset
- adapt_data = dataset_ops.Dataset.from_tensor_slices(
- (one_array, one_array, one_array))
-    adapt_data = adapt_data.batch(2)  # 2 batches of 2 samples
-
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 2)
- self.assertEqual(l1.adapt_count, 2)
- self.assertEqual(l2.adapt_count, 2)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
- self.assertLessEqual(l1.adapt_time, l2.adapt_time)
-
- # Test error with bad data
- with self.assertRaisesRegex(ValueError, 'requires a '):
- stage.adapt(None)
-
- def test_adapt_preprocessing_stage_with_dict_input(self):
- x0 = Input(shape=(3,), name='x0')
- x1 = Input(shape=(4,), name='x1')
- x2 = Input(shape=(3, 5), name='x2')
-
-    # Dimensions will mismatch if x1 is incorrectly placed.
- x1_sum = core.Lambda(
- lambda x: math_ops.reduce_sum(x, axis=-1, keepdims=True))(
- x1)
- x2_sum = core.Lambda(lambda x: math_ops.reduce_sum(x, axis=-1))(x2)
-
- l0 = PLMerge()
- y = l0([x0, x1_sum])
-
- l1 = PLMerge()
- y = l1([y, x2_sum])
-
- l2 = PLSplit()
- z, y = l2(y)
- stage = preprocessing_stage.FunctionalPreprocessingStage(
- {
- 'x2': x2,
- 'x0': x0,
- 'x1': x1
- }, [y, z])
- stage.compile()
-
- # Test with dict of NumPy array
- one_array0 = np.ones((4, 3), dtype='float32')
- one_array1 = np.ones((4, 4), dtype='float32')
- one_array2 = np.ones((4, 3, 5), dtype='float32')
- adapt_data = {'x1': one_array1, 'x0': one_array0, 'x2': one_array2}
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 1)
- self.assertEqual(l1.adapt_count, 1)
- self.assertEqual(l2.adapt_count, 1)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
- self.assertLessEqual(l1.adapt_time, l2.adapt_time)
-
- # Check call
- y, z = stage({
- 'x1': array_ops.constant(one_array1),
- 'x2': array_ops.constant(one_array2),
- 'x0': array_ops.constant(one_array0)
- })
- self.assertAllClose(y, np.zeros((4, 3), dtype='float32') + 9.)
- self.assertAllClose(z, np.zeros((4, 3), dtype='float32') + 11.)
-
- # Test with list of NumPy array
- adapt_data = [one_array0, one_array1, one_array2]
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 2)
- self.assertEqual(l1.adapt_count, 2)
- self.assertEqual(l2.adapt_count, 2)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
- self.assertLessEqual(l1.adapt_time, l2.adapt_time)
-
- # Test with flattened dataset
- adapt_data = dataset_ops.Dataset.from_tensor_slices(
- (one_array0, one_array1, one_array2))
-    adapt_data = adapt_data.batch(2)  # 2 batches of 2 samples
-
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 3)
- self.assertEqual(l1.adapt_count, 3)
- self.assertEqual(l2.adapt_count, 3)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
- self.assertLessEqual(l1.adapt_time, l2.adapt_time)
-
- # Test with dataset in dict shape
- adapt_data = dataset_ops.Dataset.from_tensor_slices({
- 'x0': one_array0,
- 'x2': one_array2,
- 'x1': one_array1
- })
-    adapt_data = adapt_data.batch(2)  # 2 batches of 2 samples
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 4)
- self.assertEqual(l1.adapt_count, 4)
- self.assertEqual(l2.adapt_count, 4)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
- self.assertLessEqual(l1.adapt_time, l2.adapt_time)
-
- # Test error with bad data
- with self.assertRaisesRegex(ValueError, 'requires a '):
- stage.adapt(None)
-
- def test_adapt_preprocessing_stage_with_dict_output(self):
- x = Input(shape=(3,), name='x')
-
- l0 = PLSplit()
- y0, y1 = l0(x)
-
- l1 = PLSplit()
- z0, z1 = l1(y0)
- stage = preprocessing_stage.FunctionalPreprocessingStage({'x': x}, {
- 'y1': y1,
- 'z1': z1,
- 'y0': y0,
- 'z0': z0
- })
- stage.compile()
-
- # Test with NumPy array
- one_array = np.ones((4, 3), dtype='float32')
- adapt_data = {'x': one_array}
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 1)
- self.assertEqual(l1.adapt_count, 1)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
-
- # Check call
- outputs = stage({'x': array_ops.constant(one_array)})
- self.assertEqual(set(outputs.keys()), {'y0', 'y1', 'z0', 'z1'})
- self.assertAllClose(outputs['y0'], np.ones((4, 3), dtype='float32') + 1.)
- self.assertAllClose(outputs['y1'], np.ones((4, 3), dtype='float32') - 1.)
- self.assertAllClose(outputs['z0'], np.ones((4, 3), dtype='float32') + 2.)
- self.assertAllClose(outputs['z1'], np.ones((4, 3), dtype='float32'))
-
- def test_preprocessing_stage_with_nested_input(self):
- # Test with NumPy array
- x0 = Input(shape=(3,))
- x1 = Input(shape=(3,))
- x2 = Input(shape=(3,))
-
- l0 = PLMerge()
- y = l0([x0, x1])
-
- l1 = PLMerge()
- y = l1([y, x2])
-
- l2 = PLSplit()
- z, y = l2(y)
-
- stage = preprocessing_stage.FunctionalPreprocessingStage([x0, [x1, x2]],
- [y, z])
- stage.compile()
- one_array = np.ones((4, 3), dtype='float32')
- stage.adapt([one_array, [one_array, one_array]])
- self.assertEqual(l0.adapt_count, 1)
- self.assertEqual(l1.adapt_count, 1)
- self.assertEqual(l2.adapt_count, 1)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
- self.assertLessEqual(l1.adapt_time, l2.adapt_time)
-
- # Check call
- y, z = stage([
- array_ops.ones((4, 3), dtype='float32'),
- [
- array_ops.ones((4, 3), dtype='float32'),
- array_ops.ones((4, 3), dtype='float32')
- ]
- ])
- self.assertAllClose(y, np.ones((4, 3), dtype='float32') + 1.)
- self.assertAllClose(z, np.ones((4, 3), dtype='float32') + 3.)
-
- # Test with dataset
- adapt_data = dataset_ops.Dataset.from_tensor_slices(
- (one_array, (one_array, one_array)))
-    adapt_data = adapt_data.batch(2)  # 2 batches of 2 samples
-
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 2)
- self.assertEqual(l1.adapt_count, 2)
- self.assertEqual(l2.adapt_count, 2)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
- self.assertLessEqual(l1.adapt_time, l2.adapt_time)
-
- # Test error with bad data
- with self.assertRaisesRegex(ValueError, 'requires a '):
- stage.adapt(None)
-
- def test_include_layers_with_dict_input(self):
-
- class PLMergeDict(PLMerge):
-
- def call(self, inputs):
- return inputs['a'] + inputs['b']
-
- x0 = Input(shape=(3,))
- x1 = Input(shape=(3,))
-
- l0 = PLMergeDict()
- y = l0({'a': x0, 'b': x1})
-
- l1 = PLSplit()
- z, y = l1(y)
-
- stage = preprocessing_stage.FunctionalPreprocessingStage([x0, x1], [y, z])
- stage.compile()
-
- one_array = np.ones((4, 3), dtype='float32')
- adapt_data = dataset_ops.Dataset.from_tensor_slices((one_array, one_array))
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 1)
- self.assertEqual(l1.adapt_count, 1)
- self.assertLessEqual(l0.adapt_time, l1.adapt_time)
-
- # Check call
- y, z = stage([
- array_ops.ones((4, 3), dtype='float32'),
- array_ops.ones((4, 3), dtype='float32')
- ])
- self.assertAllClose(y, np.ones((4, 3), dtype='float32'))
- self.assertAllClose(z, np.ones((4, 3), dtype='float32') + 2.)
-
- def test_include_layers_with_nested_input(self):
-
- class PLMergeNest(PLMerge):
-
- def call(self, inputs):
- a = inputs[0]
- b = inputs[1][0]
- c = inputs[1][1]
- return a + b + c
-
- x0 = Input(shape=(3,))
- x1 = Input(shape=(3,))
- x2 = Input(shape=(3,))
-
- l0 = PLMergeNest()
- y = l0([x0, [x1, x2]])
-
- stage = preprocessing_stage.FunctionalPreprocessingStage([x0, x1, x2], y)
- stage.compile()
-
- one_array = np.ones((4, 3), dtype='float32')
- adapt_data = dataset_ops.Dataset.from_tensor_slices((one_array,) * 3)
- stage.adapt(adapt_data)
- self.assertEqual(l0.adapt_count, 1)
-
- # Check call
- y = stage([
- array_ops.ones((4, 3), dtype='float32'),
- array_ops.ones((4, 3), dtype='float32'),
- array_ops.ones((4, 3), dtype='float32')
- ])
- self.assertAllClose(y, np.ones((4, 3), dtype='float32') + 2.)
-
- def test_mixing_preprocessing_and_regular_layers(self):
- x0 = Input(shape=(10, 10, 3))
- x1 = Input(shape=(10, 10, 3))
- x2 = Input(shape=(10, 10, 3))
-
- y0 = merge.Add()([x0, x1])
- y1 = image_preprocessing.CenterCrop(8, 8)(x2)
- y1 = convolutional.ZeroPadding2D(padding=1)(y1)
-
- z = merge.Add()([y0, y1])
- z = normalization.Normalization()(z)
- z = convolutional.Conv2D(4, 3)(z)
-
- stage = preprocessing_stage.FunctionalPreprocessingStage([x0, x1, x2], z)
-
- data = [
- np.ones((12, 10, 10, 3), dtype='float32'),
- np.ones((12, 10, 10, 3), dtype='float32'),
- np.ones((12, 10, 10, 3), dtype='float32')
- ]
-
- stage.adapt(data)
- _ = stage(data)
- stage.compile('rmsprop', 'mse')
- with self.assertRaisesRegex(ValueError, 'Preprocessing stage'):
- stage.fit(data, np.ones((12, 8, 8, 4)))
-
- ds_x0 = dataset_ops.Dataset.from_tensor_slices(np.ones((12, 10, 10, 3)))
- ds_x1 = dataset_ops.Dataset.from_tensor_slices(np.ones((12, 10, 10, 3)))
- ds_x2 = dataset_ops.Dataset.from_tensor_slices(np.ones((12, 10, 10, 3)))
- ds_x = dataset_ops.Dataset.zip((ds_x0, ds_x1, ds_x2))
- ds_y = dataset_ops.Dataset.from_tensor_slices(np.ones((12, 8, 8, 4)))
- dataset = dataset_ops.Dataset.zip((ds_x, ds_y)).batch(4)
-
- with self.assertRaisesRegex(ValueError, 'Preprocessing stage'):
- stage.fit(dataset)
- _ = stage.evaluate(data, np.ones((12, 8, 8, 4)))
- _ = stage.predict(data)
-
-
-if __name__ == '__main__':
- test.main()
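Note: the tests above exercised the now-removed FunctionalPreprocessingStage.
A hedged sketch of the equivalent pattern with public Keras APIs only: adapt
stateful preprocessing layers individually, then compose them with regular
layers in a functional model (the specific layers and shapes here are
illustrative):

import numpy as np
import tensorflow as tf

# Adapt the stateful preprocessing layer on sample data to set its state.
norm = tf.keras.layers.Normalization()
norm.adapt(np.random.rand(64, 3).astype('float32'))

# Compose it with regular layers in a functional model.
inputs = tf.keras.Input(shape=(3,))
outputs = tf.keras.layers.Dense(4)(norm(inputs))
model = tf.keras.Model(inputs, outputs)
print(model(np.ones((2, 3), dtype='float32')).shape)  # (2, 4)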
diff --git a/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_test.py b/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_test.py
deleted file mode 100644
index 7da6e62..0000000
--- a/tensorflow/python/keras/layers/preprocessing/preprocessing_stage_test.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Preprocessing stage tests."""
-# pylint: disable=g-classes-have-attributes
-
-import time
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.engine import base_preprocessing_layer
-from tensorflow.python.keras.layers.preprocessing import preprocessing_stage
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class PreprocessingStageTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_adapt(self):
-
- class PL(base_preprocessing_layer.PreprocessingLayer):
-
- def __init__(self, **kwargs):
- self.adapt_time = None
- self.adapt_count = 0
- super(PL, self).__init__(**kwargs)
-
- def adapt(self, data, reset_state=True):
- self.adapt_time = time.time()
- self.adapt_count += 1
-
- def call(self, inputs):
- return inputs + 1.
-
- # Test with NumPy array
- stage = preprocessing_stage.PreprocessingStage([
- PL(),
- PL(),
- PL(),
- ])
- stage.adapt(np.ones((3, 4)))
- self.assertEqual(stage.layers[0].adapt_count, 1)
- self.assertEqual(stage.layers[1].adapt_count, 1)
- self.assertEqual(stage.layers[2].adapt_count, 1)
- self.assertLessEqual(stage.layers[0].adapt_time, stage.layers[1].adapt_time)
- self.assertLessEqual(stage.layers[1].adapt_time, stage.layers[2].adapt_time)
-
- # Check call
- y = stage(array_ops.ones((3, 4)))
- self.assertAllClose(y, np.ones((3, 4)) + 3.)
-
- # Test with dataset
- adapt_data = dataset_ops.Dataset.from_tensor_slices(np.ones((3, 10)))
-    adapt_data = adapt_data.batch(2)  # 2 batches (2 samples and 1 sample)
-
- stage.adapt(adapt_data)
- self.assertEqual(stage.layers[0].adapt_count, 2)
- self.assertEqual(stage.layers[1].adapt_count, 2)
- self.assertEqual(stage.layers[2].adapt_count, 2)
- self.assertLess(stage.layers[0].adapt_time, stage.layers[1].adapt_time)
- self.assertLess(stage.layers[1].adapt_time, stage.layers[2].adapt_time)
-
- # Test error with bad data
- with self.assertRaisesRegex(ValueError, 'requires a '):
- stage.adapt(None)
-
-
-if __name__ == '__main__':
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/preprocessing_test_utils.py b/tensorflow/python/keras/layers/preprocessing/preprocessing_test_utils.py
deleted file mode 100644
index 86278d3..0000000
--- a/tensorflow/python/keras/layers/preprocessing/preprocessing_test_utils.py
+++ /dev/null
@@ -1,160 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Keras' base preprocessing layer."""
-
-import collections
-import numpy as np
-
-from tensorflow.python.platform import test
-
-
-class PreprocessingLayerTest(test.TestCase):
- """Base test class for preprocessing layer API validation."""
- # TODO(b/137303934): Consider incorporating something like this Close vs All
- # behavior into core tf.test.TestCase.
-
- def assertAllCloseOrEqual(self, a, b, msg=None):
- """Asserts that elements are close (if numeric) or equal (if string)."""
- if a is None or b is None:
- self.assertAllEqual(a, b, msg=msg)
- elif isinstance(a, (list, tuple)):
- self.assertEqual(len(a), len(b))
- for a_value, b_value in zip(a, b):
- self.assertAllCloseOrEqual(a_value, b_value, msg=msg)
- elif isinstance(a, collections.abc.Mapping):
- self.assertEqual(len(a), len(b))
- for key, a_value in a.items():
- b_value = b[key]
- error_message = "{} ({})".format(msg, key) if msg else None
- self.assertAllCloseOrEqual(a_value, b_value, error_message)
- elif (isinstance(a, float) or
- hasattr(a, "dtype") and np.issubdtype(a.dtype, np.number)):
- self.assertAllClose(a, b, msg=msg)
- else:
- self.assertAllEqual(a, b, msg=msg)
-
- def assert_extracted_output_equal(self, combiner, acc1, acc2, msg=None):
- data_1 = combiner.extract(acc1)
- data_2 = combiner.extract(acc2)
- self.assertAllCloseOrEqual(data_1, data_2, msg=msg)
-
- # This is an injection seam so that tests like TextVectorizationTest can
- # define their own methods for asserting that accumulators are equal.
- compare_accumulators = assertAllCloseOrEqual
-
- def validate_accumulator_computation(self, combiner, data, expected):
- """Validate that various combinations of compute and merge are identical."""
- if len(data) < 4:
- raise AssertionError("Data must have at least 4 elements.")
- data_0 = np.array([data[0]])
- data_1 = np.array([data[1]])
- data_2 = np.array(data[2:])
-
- single_compute = combiner.compute(data)
-
- all_merge = combiner.merge([
- combiner.compute(data_0),
- combiner.compute(data_1),
- combiner.compute(data_2)
- ])
-
- self.compare_accumulators(
- single_compute,
- all_merge,
- msg="Sharding data should not change the data output.")
-
- unordered_all_merge = combiner.merge([
- combiner.compute(data_1),
- combiner.compute(data_2),
- combiner.compute(data_0)
- ])
- self.compare_accumulators(
- all_merge,
- unordered_all_merge,
- msg="The order of merge arguments should not change the data "
- "output.")
-
- hierarchical_merge = combiner.merge([
- combiner.compute(data_1),
- combiner.merge([combiner.compute(data_2),
- combiner.compute(data_0)])
- ])
- self.compare_accumulators(
- all_merge,
- hierarchical_merge,
- msg="Nesting merge arguments should not change the data output.")
-
- nested_compute = combiner.compute(
- data_0, combiner.compute(data_1, combiner.compute(data_2)))
- self.compare_accumulators(
- all_merge,
- nested_compute,
- msg="Nesting compute arguments should not change the data output.")
-
- mixed_compute = combiner.merge([
- combiner.compute(data_0),
- combiner.compute(data_1, combiner.compute(data_2))
- ])
- self.compare_accumulators(
- all_merge,
- mixed_compute,
- msg="Mixing merge and compute calls should not change the data "
- "output.")
-
- single_merge = combiner.merge([
- combiner.merge([combiner.compute(data_0)]),
- combiner.compute(data_1, combiner.compute(data_2))
- ])
- self.compare_accumulators(
- all_merge,
- single_merge,
- msg="Calling merge with a data length of 1 should not change the data "
- "output.")
-
- self.compare_accumulators(
- expected,
- all_merge,
- msg="Calculated accumulators "
- "did not match expected accumulator.")
-
- def validate_accumulator_extract(self, combiner, data, expected):
- """Validate that the expected results of computing and extracting."""
- acc = combiner.compute(data)
- extracted_data = combiner.extract(acc)
- self.assertAllCloseOrEqual(expected, extracted_data)
-
- def validate_accumulator_extract_and_restore(self, combiner, data, expected):
- """Validate that the extract<->restore loop loses no data."""
- acc = combiner.compute(data)
- extracted_data = combiner.extract(acc)
- restored_acc = combiner.restore(extracted_data)
- self.assert_extracted_output_equal(combiner, acc, restored_acc)
- self.assertAllCloseOrEqual(expected, combiner.extract(restored_acc))
-
- def validate_accumulator_serialize_and_deserialize(self, combiner, data,
- expected):
- """Validate that the serialize<->deserialize loop loses no data."""
- acc = combiner.compute(data)
- serialized_data = combiner.serialize(acc)
- deserialized_data = combiner.deserialize(serialized_data)
- self.compare_accumulators(acc, deserialized_data)
- self.compare_accumulators(expected, deserialized_data)
-
- def validate_accumulator_uniqueness(self, combiner, data):
- """Validate that every call to compute creates a unique accumulator."""
- acc = combiner.compute(data)
- acc2 = combiner.compute(data)
- self.assertIsNot(acc, acc2)
- self.compare_accumulators(acc, acc2)
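Note: the validation helpers above check that sharding, reordering, and
nesting of compute/merge calls never change the resulting accumulator. A toy
illustration of that contract (the SumCombiner class is hypothetical, not a
TensorFlow API):

import numpy as np

class SumCombiner:
  """Toy combiner: the accumulator is just a running sum."""

  def compute(self, values, accumulator=None):
    base = 0.0 if accumulator is None else accumulator
    return base + float(np.sum(values))

  def merge(self, accumulators):
    return float(np.sum(accumulators))

combiner = SumCombiner()
data = np.array([1.0, 2.0, 3.0, 4.0])
single = combiner.compute(data)
sharded = combiner.merge([combiner.compute(data[:1]),
                          combiner.compute(data[1:2]),
                          combiner.compute(data[2:])])
assert single == sharded == 10.0  # shard order and nesting do not matter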
diff --git a/tensorflow/python/keras/layers/preprocessing/reduction.py b/tensorflow/python/keras/layers/preprocessing/reduction.py
deleted file mode 100644
index 9d8c4f5..0000000
--- a/tensorflow/python/keras/layers/preprocessing/reduction.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras reduction layer."""
-# pylint: disable=g-classes-have-attributes
-
-from tensorflow.python.keras.engine.base_layer import Layer
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import tf_logging as logging
-
-
-def get_reduce_op(reduction_str):
- """Translate a reduction string name to a reduction op."""
- if reduction_str == "max":
- return math_ops.reduce_max
- elif reduction_str == "mean":
- return math_ops.reduce_mean
- elif reduction_str == "min":
- return math_ops.reduce_min
- elif reduction_str == "prod":
- return math_ops.reduce_prod
- elif reduction_str == "sum":
- return math_ops.reduce_sum
- else:
- raise ValueError("Reduction %s is not supported for unweighted inputs." %
- reduction_str)
-
-
-class Reduction(Layer):
- """Performs an optionally-weighted reduction.
-
- This layer performs a reduction across one axis of its input data. This
- data may optionally be weighted by passing in an identical float tensor.
-
- Args:
- reduction: The type of reduction to perform. Can be one of the following:
- "max", "mean", "min", "prod", or "sum". This layer uses the Tensorflow
- reduce op which corresponds to that reduction (so, for "mean", we use
- "reduce_mean").
-    axis: The axis to reduce along. Defaults to -2, which is usually the axis
-      that contains embeddings (but is not within the embedding itself).
-
- Input shape:
- A tensor of 2 or more dimensions of any numeric dtype.
-
- Output:
-    A tensor with one fewer dimension than the input tensor, of the same dtype.
-
- Call arguments:
- inputs: The data to reduce.
- weights: An optional tensor or constant of the same shape as inputs that
- will weight the input data before it is reduced.
- """
- # TODO(momernick): Add example here.
-
- def __init__(self, reduction, axis=-2, **kwargs):
- self.reduction = reduction
- self.axis = axis
- # We temporarily turn off autocasting, as it does not apply to named call
- # kwargs.
- super(Reduction, self).__init__(**kwargs)
-
- def call(self, inputs, weights=None):
- # If we are not weighting the inputs we can immediately reduce the data
- # and return it.
- if weights is None:
- return get_reduce_op(self.reduction)(inputs, axis=self.axis)
-
- # TODO(momernick): Add checks for this and a decent error message if the
- # weight shape isn't compatible.
- if weights.shape.rank + 1 == inputs.shape.rank:
- weights = array_ops.expand_dims(weights, -1)
-
- weighted_inputs = math_ops.multiply(inputs, weights)
-
- # Weighted sum and prod can be expressed as reductions over the weighted
- # values, as can min and max.
- if self.reduction in ("sum", "prod", "min", "max"):
- return get_reduce_op(self.reduction)(weighted_inputs, axis=self.axis)
-
- # Weighted mean is a bit more complicated: we have to do a sum of the
- # weighted values and divide by the sum of the weights.
- if self.reduction == "mean":
- input_sum = math_ops.reduce_sum(weighted_inputs, axis=self.axis)
- weight_sum = math_ops.reduce_sum(weights, axis=self.axis)
- return math_ops.divide(input_sum, weight_sum)
-
- # sqrtn is also more complicated: it's like mean but with a normalized
- # divisor.
- if self.reduction == "sqrtn":
- logging.warning("Reduction `sqrtn` is deprecated and will be removed "
- "2021-01-01. Please use the `sum` reduction and divide "
- "the output by the normalized weights instead.")
- input_sum = math_ops.reduce_sum(weighted_inputs, axis=self.axis)
- squared_weights = math_ops.pow(weights, 2)
- squared_weights_sum = math_ops.reduce_sum(squared_weights, axis=self.axis)
- sqrt_weights_sum = math_ops.sqrt(squared_weights_sum)
- return math_ops.divide(input_sum, sqrt_weights_sum)
-
- raise ValueError("%s is not a supported weighted reduction." %
- self.reduction)
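Note: the Reduction docstring above still carried a TODO for an example. A
hedged sketch of the weighted-mean math its call() implemented, written
against public TF ops since the layer itself is removed here:

import tensorflow as tf

inputs = tf.constant([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]])   # shape (1, 3, 2)
weights = tf.constant([[[1.0, 1.0], [2.0, 2.0], [1.0, 1.0]]])  # same shape

# Weighted mean along axis=-2: sum of weighted inputs / sum of the weights.
weighted_mean = (tf.reduce_sum(inputs * weights, axis=-2) /
                 tf.reduce_sum(weights, axis=-2))
print(weighted_mean.numpy())  # [[2. 2.]]  ->  (1*1 + 2*2 + 3*1) / (1 + 2 + 1)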
diff --git a/tensorflow/python/keras/layers/preprocessing/reduction_test.py b/tensorflow/python/keras/layers/preprocessing/reduction_test.py
deleted file mode 100644
index dd8bfa8..0000000
--- a/tensorflow/python/keras/layers/preprocessing/reduction_test.py
+++ /dev/null
@@ -1,227 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for keras.layers.preprocessing.reduction."""
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python import keras
-
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.layers.preprocessing import reduction
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.platform import test
-
-
-@keras_parameterized.run_all_keras_modes
-class ReductionTest(keras_parameterized.TestCase):
-
- @parameterized.named_parameters(
- {
- "testcase_name": "max",
- "reduction_str": "max",
- "expected_output": [[3.0, 3.0], [3.0, 2.0]]
- }, {
- "testcase_name": "mean",
- "reduction_str": "mean",
- "expected_output": [[2.0, 2.0], [2.0, 1.5]]
- }, {
- "testcase_name": "min",
- "reduction_str": "min",
- "expected_output": [[1.0, 1.0], [1.0, 1.0]]
- }, {
- "testcase_name": "prod",
- "reduction_str": "prod",
- "expected_output": [[6.0, 6.0], [3.0, 2.0]]
- }, {
- "testcase_name": "sum",
- "reduction_str": "sum",
- "expected_output": [[6.0, 6.0], [4.0, 3.0]]
- })
- def test_unweighted_ragged_reduction(self, reduction_str, expected_output):
- data = ragged_factory_ops.constant([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
- [[3.0, 1.0], [1.0, 2.0]]])
- input_tensor = keras.Input(shape=(None, None), ragged=True)
-
- output_tensor = reduction.Reduction(reduction=reduction_str)(input_tensor)
- model = keras.Model(input_tensor, output_tensor)
-
- output = model.predict(data)
-
- self.assertAllClose(expected_output, output)
-
- @parameterized.named_parameters(
- {
- "testcase_name": "max",
- "reduction_str": "max",
- "expected_output": [[4.0, 4.0], [1.5, 6.0]]
- }, {
- "testcase_name": "mean",
- "reduction_str": "mean",
- "expected_output": [[2.0, 2.0], [1.666667, 1.75]]
- }, {
- "testcase_name": "min",
- "reduction_str": "min",
- "expected_output": [[1.0, 1.0], [1.0, 1.0]]
- }, {
- "testcase_name": "prod",
- "reduction_str": "prod",
- "expected_output": [[12.0, 12.0], [1.5, 6.0]]
- }, {
- "testcase_name": "sum",
- "reduction_str": "sum",
- "expected_output": [[8.0, 8.0], [2.5, 7.0]]
- }, {
- "testcase_name": "sqrtn",
- "reduction_str": "sqrtn",
- "expected_output": [[3.265986, 3.265986], [2.236067, 2.213594]]
- })
- def test_weighted_ragged_reduction(self, reduction_str, expected_output):
- data = ragged_factory_ops.constant([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
- [[3.0, 1.0], [1.0, 2.0]]])
- input_tensor = keras.Input(shape=(None, None), ragged=True)
-
- weights = ragged_factory_ops.constant([[[1.0, 1.0], [2.0, 2.0], [1.0, 1.0]],
- [[0.5, 1.0], [1.0, 3.0]]])
- weight_input_tensor = keras.Input(shape=(None, None), ragged=True)
-
- output_tensor = reduction.Reduction(reduction=reduction_str)(
- input_tensor, weights=weight_input_tensor)
- model = keras.Model([input_tensor, weight_input_tensor], output_tensor)
-
- output = model.predict([data, weights])
- self.assertAllClose(expected_output, output)
-
- def test_weighted_ragged_reduction_with_different_dimensionality(self):
- data = ragged_factory_ops.constant([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
- [[3.0, 1.0], [1.0, 2.0]]])
- input_tensor = keras.Input(shape=(None, None), ragged=True)
-
- weights = ragged_factory_ops.constant([[1.0, 2.0, 1.0], [1.0, 1.0]])
- weight_input_tensor = keras.Input(shape=(None,), ragged=True)
-
- output_tensor = reduction.Reduction(reduction="mean")(
- input_tensor, weights=weight_input_tensor)
- model = keras.Model([input_tensor, weight_input_tensor], output_tensor)
-
- output = model.predict([data, weights])
- expected_output = [[2.0, 2.0], [2.0, 1.5]]
- self.assertAllClose(expected_output, output)
-
- @parameterized.named_parameters(
- {
- "testcase_name": "max",
- "reduction_str": "max",
- "expected_output": [[3.0, 3.0], [3.0, 2.0]]
- }, {
- "testcase_name": "mean",
- "reduction_str": "mean",
- "expected_output": [[2.0, 2.0], [1.333333, 1.0]]
- }, {
- "testcase_name": "min",
- "reduction_str": "min",
- "expected_output": [[1.0, 1.0], [0.0, 0.0]]
- }, {
- "testcase_name": "prod",
- "reduction_str": "prod",
- "expected_output": [[6.0, 6.0], [0.0, 0.0]]
- }, {
- "testcase_name": "sum",
- "reduction_str": "sum",
- "expected_output": [[6.0, 6.0], [4.0, 3.0]]
- })
- def test_unweighted_dense_reduction(self, reduction_str, expected_output):
- data = np.array([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
- [[3.0, 1.0], [1.0, 2.0], [0.0, 0.0]]])
- input_tensor = keras.Input(shape=(None, None))
-
- output_tensor = reduction.Reduction(reduction=reduction_str)(input_tensor)
- model = keras.Model(input_tensor, output_tensor)
-
- output = model.predict(data)
-
- self.assertAllClose(expected_output, output)
-
- @parameterized.named_parameters(
- {
- "testcase_name": "max",
- "reduction_str": "max",
- "expected_output": [[4.0, 4.0], [1.5, 6.0]]
- }, {
- "testcase_name": "mean",
- "reduction_str": "mean",
- "expected_output": [[2.0, 2.0], [1.666667, 1.75]]
- }, {
- "testcase_name": "min",
- "reduction_str": "min",
- "expected_output": [[1.0, 1.0], [0.0, 0.0]]
- }, {
- "testcase_name": "prod",
- "reduction_str": "prod",
- "expected_output": [[12.0, 12.0], [0.0, 0.0]]
- }, {
- "testcase_name": "sum",
- "reduction_str": "sum",
- "expected_output": [[8.0, 8.0], [2.5, 7.0]]
- }, {
- "testcase_name": "sqrtn",
- "reduction_str": "sqrtn",
- "expected_output": [[3.265986, 3.265986], [2.236067, 2.213594]]
- })
- def test_weighted_dense_reduction(self, reduction_str, expected_output):
- data = np.array([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
- [[3.0, 1.0], [1.0, 2.0], [0.0, 0.0]]])
- input_tensor = keras.Input(shape=(None, None))
-
- weights = np.array([[[1.0, 1.0], [2.0, 2.0], [1.0, 1.0]],
- [[0.5, 1.0], [1.0, 3.0], [0.0, 0.0]]])
- weight_input_tensor = keras.Input(shape=(None, None))
-
- output_tensor = reduction.Reduction(reduction=reduction_str)(
- input_tensor, weights=weight_input_tensor)
- model = keras.Model([input_tensor, weight_input_tensor], output_tensor)
-
- output = model.predict([data, weights])
-
- self.assertAllClose(expected_output, output)
-
- def test_weighted_dense_reduction_with_different_dimensionality(self):
- data = np.array([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
- [[3.0, 1.0], [1.0, 2.0], [0.0, 0.0]]])
- input_tensor = keras.Input(shape=(None, None))
-
- weights = np.array([[1.0, 2.0, 1.0], [1.0, 1.0, 0.0]])
- weight_input_tensor = keras.Input(shape=(None,))
-
- output_tensor = reduction.Reduction(reduction="mean")(
- input_tensor, weights=weight_input_tensor)
- model = keras.Model([input_tensor, weight_input_tensor], output_tensor)
-
- output = model.predict([data, weights])
- expected_output = [[2.0, 2.0], [2.0, 1.5]]
- self.assertAllClose(expected_output, output)
-
- def test_sqrtn_fails_on_unweighted_ragged(self):
- input_tensor = keras.Input(shape=(None, None), ragged=True)
- with self.assertRaisesRegex(ValueError, ".*sqrtn.*"):
- _ = reduction.Reduction(reduction="sqrtn")(input_tensor)
-
- def test_sqrtn_fails_on_unweighted_dense(self):
- input_tensor = keras.Input(shape=(None, None))
- with self.assertRaisesRegex(ValueError, ".*sqrtn.*"):
- _ = reduction.Reduction(reduction="sqrtn")(input_tensor)
-
-if __name__ == "__main__":
- test.main()
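Note: a quick numeric check of the "sqrtn" expectation used in the removed
tests above (weighted sum divided by the L2 norm of the weights along the
reduced axis), computed directly with NumPy:

import numpy as np

values = np.array([1.0, 2.0, 3.0])
weights = np.array([1.0, 2.0, 1.0])

weighted_sum = np.sum(values * weights)               # 1 + 4 + 3 = 8
sqrtn = weighted_sum / np.sqrt(np.sum(weights ** 2))  # 8 / sqrt(6)
print(round(sqrtn, 6))  # 3.265986, matching the expected test output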
diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup.py b/tensorflow/python/keras/layers/preprocessing/string_lookup.py
deleted file mode 100644
index 0c5c130..0000000
--- a/tensorflow/python/keras/layers/preprocessing/string_lookup.py
+++ /dev/null
@@ -1,341 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras string lookup preprocessing layer."""
-# pylint: disable=g-classes-have-attributes
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.keras.layers.preprocessing import index_lookup
-from tensorflow.python.keras.layers.preprocessing import table_utils
-from tensorflow.python.util import compat
-from tensorflow.python.util.tf_export import keras_export
-
-
-@keras_export("keras.layers.experimental.preprocessing.StringLookup", v1=[])
-class StringLookup(index_lookup.IndexLookup):
- """Maps strings from a vocabulary to integer indices.
-
- This layer translates a set of arbitrary strings into an integer output via a
- table-based vocabulary lookup.
-
- The vocabulary for the layer can be supplied on construction or learned via
- `adapt()`. During `adapt()`, the layer will analyze a data set, determine the
-  frequency of individual string tokens, and create a vocabulary from them. If
- the vocabulary is capped in size, the most frequent tokens will be used to
- create the vocabulary and all others will be treated as out-of-vocabulary
- (OOV).
-
- There are two possible output modes for the layer.
- When `output_mode` is `"int"`,
- input strings are converted to their index in the vocabulary (an integer).
- When `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`, input strings
- are encoded into an array where each dimension corresponds to an element in
- the vocabulary.
-
- The vocabulary can optionally contain a mask token as well as an OOV token
- (which can optionally occupy multiple indices in the vocabulary, as set
- by `num_oov_indices`).
- The position of these tokens in the vocabulary is fixed. When `output_mode` is
- `"int"`, the vocabulary will begin with the mask token (if set), followed by
- OOV indices, followed by the rest of the vocabulary. When `output_mode` is
- `"multi_hot"`, `"count"`, or `"tf_idf"` the vocabulary will begin with OOV
- indices and instances of the mask token will be dropped.
-
- Args:
- max_tokens: The maximum size of the vocabulary for this layer. If None,
- there is no cap on the size of the vocabulary. Note that this size
-      includes the OOV and mask tokens. Defaults to None.
- num_oov_indices: The number of out-of-vocabulary tokens to use. If this
- value is more than 1, OOV inputs are hashed to determine their OOV value.
- If this value is 0, OOV inputs will cause an error when calling the layer.
- Defaults to 1.
- mask_token: A token that represents masked inputs. When `output_mode` is
- `"int"`, the token is included in vocabulary and mapped to index 0. In
- other output modes, the token will not appear in the vocabulary and
- instances of the mask token in the input will be dropped. If set to None,
- no mask term will be added. Defaults to `None`.
- oov_token: Only used when `invert` is True. The token to return for OOV
- indices. Defaults to `"[UNK]"`.
- vocabulary: An optional list of tokens, or a path to a text file containing
- a vocabulary to load into this layer. The file should contain one token
- per line. If the list or file contains the same token multiple times, an
- error will be thrown.
- invert: Only valid when `output_mode` is `"int"`. If True, this layer will
- map indices to vocabulary items instead of mapping vocabulary items to
-      indices. Defaults to False.
- output_mode: Specification for the output of the layer. Defaults to `"int"`.
- Values can be `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or
- `"tf_idf"` configuring the layer as follows:
- - `"int"`: Return the raw integer indices of the input tokens.
- - `"one_hot"`: Encodes each individual element in the input into an
- array the same size as the vocabulary, containing a 1 at the element
- index. If the last dimension is size 1, will encode on that dimension.
- If the last dimension is not size 1, will append a new dimension for
- the encoded output.
- - `"multi_hot"`: Encodes each sample in the input into a single array
- the same size as the vocabulary, containing a 1 for each vocabulary
- term present in the sample. Treats the last dimension as the sample
- dimension, if input shape is (..., sample_length), output shape will
- be (..., num_tokens).
- - `"count"`: As `"multi_hot"`, but the int array contains a count of the
- number of times the token at that index appeared in the sample.
- - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
- find the value in each token slot.
- pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`,
- `"count"`, or `"tf_idf"`. If True, the output will have its feature axis
- padded to `max_tokens` even if the number of unique tokens in the
- vocabulary is less than max_tokens, resulting in a tensor of shape
- [batch_size, max_tokens] regardless of vocabulary size. Defaults to False.
- sparse: Boolean. Only applicable when `output_mode` is `"multi_hot"`,
- `"count"`, or `"tf_idf"`. If True, returns a `SparseTensor` instead of a
- dense `Tensor`. Defaults to False.
-
- Examples:
-
- **Creating a lookup layer with a known vocabulary**
-
- This example creates a lookup layer with a pre-existing vocabulary.
-
- >>> vocab = ["a", "b", "c", "d"]
- >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
- >>> layer = StringLookup(vocabulary=vocab)
- >>> layer(data)
- <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
- array([[1, 3, 4],
- [4, 0, 2]])>
-
- **Creating a lookup layer with an adapted vocabulary**
-
- This example creates a lookup layer and generates the vocabulary by analyzing
- the dataset.
-
- >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
- >>> layer = StringLookup()
- >>> layer.adapt(data)
- >>> layer.get_vocabulary()
- ['[UNK]', 'd', 'z', 'c', 'b', 'a']
-
- Note that the OOV token [UNK] has been added to the vocabulary. The remaining
- tokens are sorted by frequency ('d', which has 2 occurrences, is first) then
- by inverse sort order.
-
- >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
- >>> layer = StringLookup()
- >>> layer.adapt(data)
- >>> layer(data)
- <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
- array([[5, 3, 1],
- [1, 2, 4]])>
-
- **Lookups with multiple OOV indices**
-
- This example demonstrates how to use a lookup layer with multiple OOV indices.
- When a layer is created with more than one OOV index, any OOV values are
- hashed into the number of OOV buckets, distributing OOV values in a
- deterministic fashion across the set.
-
- >>> vocab = ["a", "b", "c", "d"]
- >>> data = tf.constant([["a", "c", "d"], ["m", "z", "b"]])
- >>> layer = StringLookup(vocabulary=vocab, num_oov_indices=2)
- >>> layer(data)
- <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
- array([[2, 4, 5],
- [0, 1, 3]])>
-
- Note that the output for OOV value 'm' is 0, while the output for OOV value
- 'z' is 1. The in-vocab terms have their output index increased by 1 from
- earlier examples (a maps to 2, etc) in order to make space for the extra OOV
- value.
-
- **One-hot output**
-
- Configure the layer with `output_mode='one_hot'`. Note that the first
-  `num_oov_indices` dimensions in the one_hot encoding represent OOV values.
-
- >>> vocab = ["a", "b", "c", "d"]
- >>> data = tf.constant(["a", "b", "c", "d", "z"])
- >>> layer = StringLookup(vocabulary=vocab, output_mode='one_hot')
- >>> layer(data)
- <tf.Tensor: shape=(5, 5), dtype=float32, numpy=
- array([[0., 1., 0., 0., 0.],
- [0., 0., 1., 0., 0.],
- [0., 0., 0., 1., 0.],
- [0., 0., 0., 0., 1.],
- [1., 0., 0., 0., 0.]], dtype=float32)>
-
- **Multi-hot output**
-
- Configure the layer with `output_mode='multi_hot'`. Note that the first
- `num_oov_indices` dimensions in the multi_hot encoding represent OOV values.
-
- >>> vocab = ["a", "b", "c", "d"]
- >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
- >>> layer = StringLookup(vocabulary=vocab, output_mode='multi_hot')
- >>> layer(data)
- <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
- array([[0., 1., 0., 1., 1.],
- [1., 0., 1., 0., 1.]], dtype=float32)>
-
- **Token count output**
-
- Configure the layer with `output_mode='count'`. As with multi_hot output, the
- first `num_oov_indices` dimensions in the output represent OOV values.
-
- >>> vocab = ["a", "b", "c", "d"]
- >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
- >>> layer = StringLookup(vocabulary=vocab, output_mode='count')
- >>> layer(data)
- <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
- array([[0., 1., 0., 1., 2.],
- [2., 0., 1., 0., 1.]], dtype=float32)>
-
- **TF-IDF output**
-
- Configure the layer with `output_mode='tf_idf'`. As with multi_hot output, the
- first `num_oov_indices` dimensions in the output represent OOV values.
-
- Each token bin will output `token_count * idf_weight`, where the idf weights
- are the inverse document frequency weights per token. These should be provided
- along with the vocabulary. Note that the `idf_weight` for OOV values will
- default to the average of all idf weights passed in.
-
- >>> vocab = ["a", "b", "c", "d"]
- >>> idf_weights = [0.25, 0.75, 0.6, 0.4]
- >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
- >>> layer = StringLookup(output_mode='tf_idf')
- >>> layer.set_vocabulary(vocab, idf_weights=idf_weights)
- >>> layer(data)
- <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
- array([[0. , 0.25, 0. , 0.6 , 0.8 ],
- [1.0 , 0. , 0.75, 0. , 0.4 ]], dtype=float32)>
-
-  To specify the idf weights for OOV values, you will need to pass the entire
-  vocabulary, including the leading OOV token.
-
- >>> vocab = ["[UNK]", "a", "b", "c", "d"]
- >>> idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4]
- >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
- >>> layer = StringLookup(output_mode='tf_idf')
- >>> layer.set_vocabulary(vocab, idf_weights=idf_weights)
- >>> layer(data)
- <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
- array([[0. , 0.25, 0. , 0.6 , 0.8 ],
- [1.8 , 0. , 0.75, 0. , 0.4 ]], dtype=float32)>
-
- When adapting the layer in tf_idf mode, each input sample will be considered a
-  document, and the idf weight per token will be calculated as
- `log(1 + num_documents / (1 + token_document_count))`.
-
- **Inverse lookup**
-
-  This example demonstrates how to map indices to strings using this layer. (You
-  could also create the layer with `invert=True` and learn the vocabulary via
-  adapt(), but for simplicity we pass the vocab in this example.)
-
- >>> vocab = ["a", "b", "c", "d"]
- >>> data = tf.constant([[1, 3, 4], [4, 0, 2]])
- >>> layer = StringLookup(vocabulary=vocab, invert=True)
- >>> layer(data)
- <tf.Tensor: shape=(2, 3), dtype=string, numpy=
- array([[b'a', b'c', b'd'],
- [b'd', b'[UNK]', b'b']], dtype=object)>
-
-  Note that the first index corresponds to the OOV token by default.
-
-
- **Forward and inverse lookup pairs**
-
- This example demonstrates how to use the vocabulary of a standard lookup
- layer to create an inverse lookup layer.
-
- >>> vocab = ["a", "b", "c", "d"]
- >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
- >>> layer = StringLookup(vocabulary=vocab)
- >>> i_layer = StringLookup(vocabulary=vocab, invert=True)
- >>> int_data = layer(data)
- >>> i_layer(int_data)
- <tf.Tensor: shape=(2, 3), dtype=string, numpy=
- array([[b'a', b'c', b'd'],
- [b'd', b'[UNK]', b'b']], dtype=object)>
-
-  In this example, the input value 'z' resulted in an output of '[UNK]', since
-  'z' was not in the vocabulary - it got represented as an OOV, and all OOV
-  values are returned as '[UNK]' in the inverse layer. Also, note that for the
- inverse to work, you must have already set the forward layer vocabulary
- either directly or via adapt() before calling get_vocabulary().
- """
-
- def __init__(self,
- max_tokens=None,
- num_oov_indices=1,
- mask_token=None,
- oov_token="[UNK]",
- vocabulary=None,
- encoding=None,
- invert=False,
- output_mode=index_lookup.INT,
- sparse=False,
- pad_to_max_tokens=False,
- **kwargs):
- allowed_dtypes = [dtypes.string]
-
- if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
- raise ValueError("The value of the dtype argument for StringLookup may "
- "only be one of %s." % (allowed_dtypes,))
-
- if "dtype" not in kwargs:
- kwargs["dtype"] = dtypes.string
-
- if encoding is None:
- encoding = "utf-8"
-
- self.encoding = encoding
-
- super(StringLookup, self).__init__(
- max_tokens=max_tokens,
- num_oov_indices=num_oov_indices,
- mask_token=mask_token,
- oov_token=oov_token,
- vocabulary=vocabulary,
- invert=invert,
- output_mode=output_mode,
- sparse=sparse,
- pad_to_max_tokens=pad_to_max_tokens,
- **kwargs)
-
- def get_config(self):
- config = {"encoding": self.encoding}
- base_config = super(StringLookup, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
- def set_vocabulary(self, vocabulary, idf_weights=None):
- if isinstance(vocabulary, str):
- if self.output_mode == index_lookup.TF_IDF:
- raise RuntimeError("Setting vocabulary directly from a file is not "
- "supported in TF-IDF mode, since this layer cannot "
- "read files containing TF-IDF weight data. Please "
- "read the file using Python and set the vocabulary "
- "and weights by passing lists or arrays to the "
- "set_vocabulary function's `vocabulary` and "
- "`idf_weights` args.")
- vocabulary = table_utils.get_vocabulary_from_file(vocabulary,
- self.encoding)
- super().set_vocabulary(vocabulary, idf_weights=idf_weights)
-
- # Overriden methods from IndexLookup.
- def _tensor_vocab_to_numpy(self, vocabulary):
- vocabulary = vocabulary.numpy()
- return np.array([compat.as_text(x, self.encoding) for x in vocabulary])
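Note: a small worked example of the idf formula quoted in the removed
StringLookup docstring, log(1 + num_documents / (1 + token_document_count)),
computed directly with NumPy rather than through the layer:

import numpy as np

# Two "documents": ["a", "c", "d", "d"] and ["d", "z", "b", "z"].
num_documents = 2
document_counts = {"a": 1, "b": 1, "c": 1, "d": 2}

idf = {token: np.log(1 + num_documents / (1 + count))
       for token, count in document_counts.items()}
# "d" appears in both documents   -> log(1 + 2/3) ~= 0.51
# "a", "b", "c" appear in one doc -> log(1 + 2/2) ~= 0.69
print({k: round(float(v), 2) for k, v in sorted(idf.items())})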
diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py
deleted file mode 100644
index cdd8cfb..0000000
--- a/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py
+++ /dev/null
@@ -1,401 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Keras text vectorization preprocessing layer."""
-
-import os
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python import keras
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import def_function
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import errors_impl
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.keras.layers.preprocessing import string_lookup
-from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-
-
-def _get_end_to_end_test_cases():
- test_cases = (
- {
- "testcase_name": "test_strings_soft_vocab_cap",
- # Create an array where 'earth' is the most frequent term, followed by
- # 'wind', then 'and', then 'fire'. This ensures that the vocab
- # accumulator is sorting by frequency.
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
- ["and"], ["earth"], ["michigan"]]),
- "kwargs": {
- "max_tokens": None,
- },
- "expected_output": [[1], [2], [3], [4], [4], [3], [1], [0]],
- "input_dtype":
- dtypes.string
- },
- )
-
- crossed_test_cases = []
- # Cross above test cases with use_dataset in (True, False)
- for use_dataset in (True, False):
- for case in test_cases:
- case = case.copy()
- if use_dataset:
- case["testcase_name"] = case["testcase_name"] + "_with_dataset"
- case["use_dataset"] = use_dataset
- crossed_test_cases.append(case)
-
- return crossed_test_cases
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class StringLookupLayerTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- @parameterized.named_parameters(*_get_end_to_end_test_cases())
- def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
- use_dataset, expected_output,
- input_dtype):
- cls = string_lookup.StringLookup
- expected_output_dtype = dtypes.int64
- input_shape = input_data.shape
-
- if use_dataset:
- # Keras APIs expect batched datasets.
- # TODO(rachelim): `model.predict` predicts the result on each
- # dataset batch separately, then tries to concatenate the results
- # together. When the results have different shapes on the non-concat
- # axis (which can happen in the output_mode = INT case for
- # StringLookup), the concatenation fails. In real use cases, this may
- # not be an issue because users are likely to pipe the preprocessing layer
- # into other keras layers instead of predicting it directly. A workaround
- # for these unit tests is to have the dataset only contain one batch, so
- # no concatenation needs to happen with the result. For consistency with
- # numpy input, we should make `predict` join differently shaped results
- # together sensibly, with 0 padding.
- input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
- input_shape[0])
- vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
- input_shape[0])
-
- with CustomObjectScope({"StringLookup": cls}):
- output_data = testing_utils.layer_test(
- cls,
- kwargs=kwargs,
- input_shape=input_shape,
- input_data=input_data,
- input_dtype=input_dtype,
- expected_output_dtype=expected_output_dtype,
- validate_training=False,
- adapt_data=vocab_data)
- self.assertAllClose(expected_output, output_data)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class StringLookupVocabularyTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest
- ):
-
- def _write_to_temp_file(self, file_name, vocab_list):
- vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
- with gfile.GFile(vocab_path, "w") as writer:
- for vocab in vocab_list:
- writer.write(vocab + "\n")
- writer.flush()
- writer.close()
- return vocab_path
-
- def test_int_output_explicit_vocab(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup(vocabulary=vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_int_output_explicit_vocab_with_special_tokens(self):
- vocab_data = ["", "[UNK]", "earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup(vocabulary=vocab_data, mask_token="")
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_int_output_no_oov(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- valid_input = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", ""]])
- invalid_input = np.array([["earth", "wind", "and", "michigan"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup(
- vocabulary=vocab_data, mask_token="", num_oov_indices=0)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(valid_input)
- self.assertAllEqual(expected_output, output_data)
- with self.assertRaisesRegex(errors.InvalidArgumentError,
- "found OOV values.*michigan"):
- _ = model.predict(invalid_input)
-
- def test_no_vocab(self):
- with self.assertRaisesRegex(
- ValueError, "You must set the layer's vocabulary"):
- layer = string_lookup.StringLookup()
- layer([["a"]])
-
- def test_one_hot_output(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array(["earth", "wind", "and", "fire", "michigan"])
- expected_output = [
- [0, 1, 0, 0, 0],
- [0, 0, 1, 0, 0],
- [0, 0, 0, 1, 0],
- [0, 0, 0, 0, 1],
- [1, 0, 0, 0, 0],
- ]
-
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = string_lookup.StringLookup(
- vocabulary=vocab_data, output_mode="one_hot")
- res = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=res)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_multi_hot_output(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[0, 1, 1, 1, 1], [1, 1, 0, 1, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup(
- vocabulary=vocab_data, output_mode="multi_hot")
- res = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=res)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_count_output(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "earth", "fire", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[0, 2, 0, 0, 2], [1, 1, 0, 1, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup(
- vocabulary=vocab_data, output_mode="count")
- res = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=res)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_sparse_output(self):
- vocab_data = ["earth", "wind", "and", "fire"]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup(
- vocabulary=vocab_data, output_mode="multi_hot", sparse=True)
- res = layer(input_data)
- self.assertEqual(res.__class__.__name__, "SparseKerasTensor")
-
- def test_get_vocab_returns_str(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- expected_vocab = ["[UNK]", "earth", "wind", "and", "fire"]
- layer = string_lookup.StringLookup(vocabulary=vocab_data)
- layer_vocab = layer.get_vocabulary()
- self.assertAllEqual(expected_vocab, layer_vocab)
- self.assertIsInstance(layer_vocab[0], str)
-
- inverse_layer = string_lookup.StringLookup(
- vocabulary=layer.get_vocabulary(), invert=True)
- layer_vocab = inverse_layer.get_vocabulary()
- self.assertAllEqual(expected_vocab, layer_vocab)
- self.assertIsInstance(layer_vocab[0], str)
-
- def test_int_output_explicit_vocab_from_file(self):
- vocab_list = ["earth", "wind", "and", "fire"]
- vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
-
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup(vocabulary=vocab_path)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_int_output_explicit_vocab_from_file_via_setter(self):
- vocab_list = ["earth", "wind", "and", "fire"]
- vocab_path = self._write_to_temp_file("vocab_file", vocab_list)
-
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup()
- layer.set_vocabulary(vocab_path)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_non_unique_vocab_fails(self):
- vocab_data = ["earth", "wind", "and", "fire", "fire"]
- with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"):
- _ = string_lookup.StringLookup(vocabulary=vocab_data)
-
- def test_non_unique_vocab_from_file_fails(self):
- vocab_list = ["earth", "wind", "and", "fire", "earth"]
- vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list)
- with self.assertRaisesRegex(
- errors_impl.FailedPreconditionError,
- "HashTable has different value for same key.*earth"):
- _ = string_lookup.StringLookup(vocabulary=vocab_path)
-
- def test_inverse_layer(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]])
- expected_output = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", ""]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = string_lookup.StringLookup(
- vocabulary=vocab_data, invert=True, mask_token="")
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_inverse_layer_from_file(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 0]])
- expected_output = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "[UNK]"]])
- vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = string_lookup.StringLookup(vocabulary=vocab_path, invert=True)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_inverse_layer_from_file_with_mask(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([[2, 3, 4, 5], [5, 4, 2, 0]])
- expected_output = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "[M]"]])
- vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.int64)
- layer = string_lookup.StringLookup(
- vocabulary=vocab_path, invert=True, mask_token="[M]")
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_forward_backward_explicit_vocab(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "[UNK]"]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup(vocabulary=vocab_data)
- invert_layer = string_lookup.StringLookup(
- vocabulary=vocab_data, invert=True)
- int_data = layer(input_data)
- out_data = invert_layer(int_data)
- model = keras.Model(inputs=input_data, outputs=out_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_forward_backward_adapted_vocab(self):
- adapt_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "[UNK]"]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = string_lookup.StringLookup()
- layer.adapt(adapt_data)
- invert_layer = string_lookup.StringLookup(
- vocabulary=layer.get_vocabulary(), invert=True)
- int_data = layer(input_data)
- out_data = invert_layer(int_data)
- model = keras.Model(inputs=input_data, outputs=out_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_ragged_string_input_multi_bucket(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = ragged_factory_ops.constant([["earth", "wind", "fire"],
- ["fire", "and", "earth",
- "ohio"]])
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True)
- layer = string_lookup.StringLookup(num_oov_indices=2)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_data = model.predict(input_array)
- self.assertAllEqual(expected_output, output_data)
-
- def test_tensor_vocab(self):
- vocab_data = ["[UNK]", "wind", "and", "fire"]
- vocab_tensor = constant_op.constant(vocab_data)
- layer = string_lookup.StringLookup(vocabulary=vocab_tensor)
- returned_vocab = layer.get_vocabulary()
- self.assertAllEqual(vocab_data, returned_vocab)
- self.assertAllEqual(layer.vocabulary_size(), 4)
- fn = def_function.function(lambda: layer.set_vocabulary(vocab_tensor))
- with self.assertRaisesRegex(RuntimeError, "Cannot set a tensor vocabulary"):
- fn()
-
-if __name__ == "__main__":
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils.py b/tensorflow/python/keras/layers/preprocessing/table_utils.py
deleted file mode 100644
index 264e6a1..0000000
--- a/tensorflow/python/keras/layers/preprocessing/table_utils.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for working with tf.lookup tables in Keras."""
-
-import collections
-import os
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops.ragged import ragged_functional_ops
-from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.ops.ragged import ragged_tensor_value
-from tensorflow.python.platform import gfile
-
-
-class TableHandler(object):
- """Wrapper object that holds a lookup table and provides accessors."""
-
- def __init__(self,
- table,
- oov_tokens=None,
- mask_token=None,
- mask_value=0):
- self.table = table
- self.mutable = isinstance(table, lookup_ops.MutableHashTable)
- self.mask_token = mask_token
- self.mask_value = mask_value
-
- if oov_tokens is None:
- self.oov_tokens = oov_tokens
- else:
- if not isinstance(oov_tokens, (list, tuple, np.ndarray)):
- oov_tokens = [oov_tokens]
- self.oov_tokens = math_ops.cast(oov_tokens, table._value_dtype) # pylint: disable=protected-access
-
- def table_size(self):
- return self.table.size().numpy()
-
- def clear(self):
- if not self.mutable:
- raise RuntimeError("Unable to clear a statically-backed table.")
-
- keys, _ = self.table.export()
- self.table.remove(keys)
-
- def insert(self, keys, values):
- """Insert values into the backed table."""
- if not self.mutable:
- raise RuntimeError("Unable to insert into a statically-backed table.")
-
- if len(values) != len(keys):
- raise RuntimeError("Size mismatch between values and key arrays. "
- "Keys had size %s, values had size %s." %
- (len(keys), len(values)))
- keys = ops.convert_to_tensor_v2_with_dispatch(
- keys, dtype=self.table._key_dtype) # pylint: disable=protected-access
- values = ops.convert_to_tensor_v2_with_dispatch(
- values, dtype=self.table._value_dtype) # pylint: disable=protected-access
- if values.shape.ndims != 1:
- raise ValueError("`values` must be 1-dimensional, got an input with "
- " %s dimensions." % values.shape.ndims)
- self.table.insert(keys, values)
-
- def _replace_oov_buckets(self, inputs, lookups):
- """Replace the default OOV value with one of the OOV bucket values."""
- if self.oov_tokens is None:
- return lookups
-
- num_oov_elements = self.oov_tokens.shape.num_elements()
- if inputs.dtype.is_integer:
- oov_indices = math_ops.floormod(inputs, num_oov_elements)
- else:
- oov_indices = string_ops.string_to_hash_bucket_fast(
- inputs, num_buckets=num_oov_elements)
-
- oov_values = array_ops.gather(self.oov_tokens, oov_indices)
- oov_locations = math_ops.equal(lookups, self.table._default_value) # pylint: disable=protected-access
-
- return array_ops.where(oov_locations, oov_values, lookups)
-
- def _lookup_and_mask(self, inputs):
- """Return a lookup with any location with the mask_token masked to 0."""
- lookups = self.table.lookup(inputs)
- # If we don't need to handle masking, return the lookup values directly.
- if self.mask_token is None:
- return lookups
-
- # Inject 0s wherever the mask token was in the inputs.
- mask_locations = math_ops.equal(inputs, self.mask_token)
- return array_ops.where_v2(
- mask_locations,
- math_ops.cast(self.mask_value, self.table._value_dtype), # pylint: disable=protected-access
- lookups) # pylint: disable=protected-access
-
- def _ragged_lookup(self, inputs):
- """Perform a table lookup on a ragged tensor."""
- # The table lookup ops don't natively support ragged tensors, so if we have
- # a RT we need to use map_flat_values to look up every element.
- indexed_data = ragged_functional_ops.map_flat_values(
- self._lookup_and_mask, inputs)
- indexed_data = ragged_functional_ops.map_flat_values(
- self._replace_oov_buckets, inputs, indexed_data)
- # table.lookup is not shape-preserving, so we need to set the shape here.
- indexed_data._set_shape(inputs.shape) # pylint: disable=protected-access
- # Composite tensors can pass tensor values through, which will cause
- # errors if all operations in the TF graph do so. We can break this chain
- # with an identity here.
- return array_ops.identity(indexed_data)
-
- def _sparse_lookup(self, inputs):
- """Perform a table lookup on a sparse tensor."""
- values = self._lookup_and_mask(inputs.values)
- values = self._replace_oov_buckets(inputs.values, values)
- indexed_data = sparse_tensor.SparseTensor(inputs.indices, values,
- inputs.dense_shape)
- # Composite tensors can pass tensor values through, which will cause
- # errors if all operations in the TF graph do so. We can break this chain
- # with an identity here.
- return array_ops.identity(indexed_data)
-
- def _tensor_lookup(self, inputs):
- """Perform a table lookup on a tf.tensor."""
- values = self._lookup_and_mask(inputs)
- indexed_data = self._replace_oov_buckets(inputs, values)
- # (b/149446477): output does not preserve input shape.
- indexed_data.set_shape(inputs.shape)
- return indexed_data
-
- def lookup(self, inputs):
- """Perform a table lookup."""
- # Sparse tensors don't play nicely with tensor conversion, so we handle
- # them before attempting to convert lists or arrays to tensors.
- if isinstance(
- inputs, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
- return self._sparse_lookup(inputs)
-
- if tf_utils.is_ragged(inputs):
- if isinstance(inputs, ragged_tensor_value.RaggedTensorValue):
- flat_values = ops.convert_to_tensor_v2_with_dispatch(
- value=inputs.flat_values, name="flat_values")
- inputs = ragged_tensor.RaggedTensor.from_nested_row_splits(
- flat_values, inputs.nested_row_splits, validate=False)
- return self._ragged_lookup(inputs)
-
- # For normal tensor inputs
- inputs = ops.convert_to_tensor_v2_with_dispatch(inputs)
- return self._tensor_lookup(inputs)
-
-
-def num_tokens_in_file(vocabulary_path):
- """Count the number of lines in a vocab file to get the number of tokens."""
- num_tokens = 0
- with gfile.GFile(vocabulary_path, "r") as reader:
- text = reader.readline()
- while text:
- num_tokens += 1
- text = reader.readline()
-
- return num_tokens
-
-
-def get_vocabulary_from_file(vocabulary_path, encoding="utf-8"):
- """Read a vocabulary in from a file."""
- vocab = []
- with gfile.GFile(vocabulary_path, "r") as reader:
- while True:
- # Get the next line (incl. \n), and break if nothing is left to read.
- text = reader.readline()
- if not text:
- break
-
- # Convert the raw text and strip whitespace.
- if isinstance(text, str):
- token = text
- elif isinstance(text, bytes):
- token = text.decode(encoding, "ignore")
- token = token.rstrip(os.linesep)
- vocab.append(token)
- return vocab
-
-
-def find_repeated_tokens(vocabulary):
- """Return all repeated tokens in a vocabulary."""
- vocabulary_set = set(vocabulary)
- if len(vocabulary) != len(vocabulary_set):
- return [
- item for item, count in collections.Counter(vocabulary).items()
- if count > 1
- ]
- else:
- return []
diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils_test.py b/tensorflow/python/keras/layers/preprocessing/table_utils_test.py
deleted file mode 100644
index 4a46ce0..0000000
--- a/tensorflow/python/keras/layers/preprocessing/table_utils_test.py
+++ /dev/null
@@ -1,439 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Keras lookup table utils."""
-
-import os
-import tempfile
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.keras.layers.preprocessing import table_utils
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-
-
-def get_table(dtype=dtypes.string, oov_tokens=None):
- table = lookup_ops.MutableHashTable(
- key_dtype=dtype,
- value_dtype=dtypes.int64,
- default_value=-7,
- name="index_table")
- return table_utils.TableHandler(table, oov_tokens)
-
-
-def get_static_table(tmpdir,
- vocab_list,
- mask_token=None,
- dtype=dtypes.string,
- oov_tokens=None):
- vocabulary_file = os.path.join(tmpdir, "tmp_vocab.txt")
-
- if dtype == dtypes.string:
- with open(vocabulary_file, "w") as f:
- f.write("\n".join(vocab_list) + "\n")
- else:
- with open(vocabulary_file, "w") as f:
- f.write("\n".join([str(v) for v in vocab_list]) + "\n")
-
- offset = ((0 if mask_token is None else 1) +
- (len(oov_tokens) if oov_tokens is not None else 0))
- init = lookup_ops.TextFileInitializer(
- vocabulary_file,
- dtype,
- lookup_ops.TextFileIndex.WHOLE_LINE,
- dtypes.int64,
- lookup_ops.TextFileIndex.LINE_NUMBER,
- value_index_offset=offset)
- table = lookup_ops.StaticHashTable(init, default_value=-7)
- return table_utils.TableHandler(
- table,
- oov_tokens,
- mask_token=mask_token)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoricalEncodingInputTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_sparse_string_input(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=["fire", "michigan"],
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [5, 1]
- expected_dense_shape = [3, 4]
-
- table = get_table(oov_tokens=[1])
- table.insert(vocab_data, range(2, len(vocab_data) + 2))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_sparse_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=np.array([13, 32], dtype=np.int64),
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [5, 1]
- expected_dense_shape = [3, 4]
-
- table = get_table(dtype=dtypes.int64, oov_tokens=[1])
- table.insert(vocab_data, range(2, len(vocab_data) + 2))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_ragged_string_input(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = ragged_factory_ops.constant(
- [["earth", "wind", "fire"], ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- table = get_table(oov_tokens=[1])
- table.insert(vocab_data, range(2, len(vocab_data) + 2))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
- def test_ragged_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
- dtype=np.int64)
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- table = get_table(dtype=dtypes.int64, oov_tokens=[1])
- table.insert(vocab_data, range(2, len(vocab_data) + 2))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
- def test_tensor_multi_dim_values_fails(self):
- key_data = np.array([0, 1], dtype=np.int64)
- value_data = np.array([[11, 12], [21, 22]])
-
- table = get_table(dtype=dtypes.int64, oov_tokens=[1, 2])
-
- with self.assertRaisesRegex(ValueError, "must be 1-dimensional"):
- table.insert(key_data, value_data)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoricalEncodingMultiOOVTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_sparse_string_input_multi_bucket(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]], values=["fire", "ohio"], dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [6, 2]
- expected_dense_shape = [3, 4]
-
- table = get_table(oov_tokens=[1, 2])
- table.insert(vocab_data, range(3, len(vocab_data) + 3))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_sparse_int_input_multi_bucket(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=np.array([13, 132], dtype=np.int64),
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [6, 1]
- expected_dense_shape = [3, 4]
-
- table = get_table(dtype=dtypes.int64, oov_tokens=[1, 2])
- table.insert(vocab_data, range(3, len(vocab_data) + 3))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_ragged_string_input_multi_bucket(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = ragged_factory_ops.constant([["earth", "wind", "fire"],
- ["fire", "and", "earth",
- "ohio"]])
- expected_output = [[3, 4, 6], [6, 5, 3, 2]]
-
- table = get_table(oov_tokens=[1, 2])
- table.insert(vocab_data, range(3, len(vocab_data) + 3))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
- def test_ragged_int_input_multi_bucket(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 132]],
- dtype=np.int64)
- expected_output = [[3, 4, 6], [6, 5, 3, 1]]
-
- table = get_table(dtype=dtypes.int64, oov_tokens=[1, 2])
- table.insert(vocab_data, range(3, len(vocab_data) + 3))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
- def test_tensor_int_input_multi_bucket(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = np.array([[13, 132], [13, 133]], dtype=np.int64)
- expected_values = [[6, 1], [6, 2]]
-
- table = get_table(dtype=dtypes.int64, oov_tokens=[1, 2])
- table.insert(vocab_data, range(3, len(vocab_data) + 3))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_values, output_data)
-
- def test_tensor_string_input_multi_bucket(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = [["earth", "wind", "fire", "michigan"],
- ["fire", "and", "earth", "ohio"]]
- expected_output = [[3, 4, 6, 1], [6, 5, 3, 2]]
-
- table = get_table(oov_tokens=[1, 2])
- table.insert(vocab_data, range(3, len(vocab_data) + 3))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class IndexLookupOutputTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_int_output_default_lookup_value(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, -7]]
-
- table = get_table(oov_tokens=None)
- table.insert(vocab_data, range(1, len(vocab_data) + 1))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
- def test_output_shape(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
-
- table = get_table()
- table.insert(vocab_data, range(1, len(vocab_data) + 1))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(input_array.shape[1:], output_data.shape[1:])
-
- def test_int_output_no_reserved_zero_default_lookup_value(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[0, 1, 2, 3], [3, 2, 0, -7]]
-
- table = get_table(oov_tokens=None)
- table.insert(vocab_data, range(len(vocab_data)))
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class StaticIndexLookupOutputTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_int_output_default_lookup_value(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[1, 2, 3, 4], [4, 3, 1, -7]]
-
- table = get_static_table(
- tmpdir=self.get_temp_dir(),
- vocab_list=vocab_data,
- mask_token="",
- oov_tokens=None)
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
- def test_output_shape(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
-
- table = get_static_table(
- tmpdir=self.get_temp_dir(), vocab_list=vocab_data, oov_tokens=None)
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(input_array.shape[1:], output_data.shape[1:])
-
- def test_int_output_no_reserved_zero_default_lookup_value(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[0, 1, 2, 3], [3, 2, 0, -7]]
-
- table = get_static_table(
- tmpdir=self.get_temp_dir(), vocab_list=vocab_data, oov_tokens=None)
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class CategoricalEncodingStaticInputTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_sparse_string_input(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=["fire", "michigan"],
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [5, 1]
- expected_dense_shape = [3, 4]
-
- table = get_static_table(
- tmpdir=self.get_temp_dir(),
- vocab_list=vocab_data,
- mask_token="",
- oov_tokens=[1])
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_sparse_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = sparse_tensor.SparseTensor(
- indices=[[0, 0], [1, 2]],
- values=np.array([13, 32], dtype=np.int64),
- dense_shape=[3, 4])
-
- expected_indices = [[0, 0], [1, 2]]
- expected_values = [5, 1]
- expected_dense_shape = [3, 4]
-
- table = get_static_table(
- tmpdir=self.get_temp_dir(),
- vocab_list=vocab_data,
- dtype=dtypes.int64,
- mask_token=0,
- oov_tokens=[1])
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_indices, output_data.indices)
- self.assertAllEqual(expected_values, output_data.values)
- self.assertAllEqual(expected_dense_shape, output_data.dense_shape)
-
- def test_ragged_string_input(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = ragged_factory_ops.constant(
- [["earth", "wind", "fire"], ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- table = get_static_table(
- tmpdir=self.get_temp_dir(),
- vocab_list=vocab_data,
- mask_token="",
- oov_tokens=[1])
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
- def test_ragged_int_input(self):
- vocab_data = np.array([10, 11, 12, 13], dtype=np.int64)
- input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]],
- dtype=np.int64)
- expected_output = [[2, 3, 5], [5, 4, 2, 1]]
-
- table = get_static_table(
- tmpdir=self.get_temp_dir(),
- vocab_list=vocab_data,
- dtype=dtypes.int64,
- mask_token=0,
- oov_tokens=[1])
- output_data = table.lookup(input_array)
-
- self.assertAllEqual(expected_output, output_data)
-
-
-class GetVocabularyFromFileTest(test.TestCase):
-
- def setUp(self):
- super(GetVocabularyFromFileTest, self).setUp()
- dir_path = tempfile.mkdtemp(prefix=test.get_temp_dir())
- self._vocab_path = os.path.join(dir_path, "vocab")
-
- def test_only_line_separator_is_stripped(self):
- expected = ["foo", " foo", "foo ", " foo "]
- with gfile.GFile(self._vocab_path, "w") as writer:
- for word in expected:
- writer.write(word)
- writer.write(os.linesep)
-
- actual = table_utils.get_vocabulary_from_file(self._vocab_path)
- self.assertAllEqual(expected, actual)
-
- def test_linux_file(self):
- content = b"line1\nline2\nline3"
- with gfile.GFile(self._vocab_path, "wb") as writer:
- writer.write(content)
-
- actual = table_utils.get_vocabulary_from_file(self._vocab_path)
- self.assertAllEqual(["line1", "line2", "line3"], actual)
-
- def test_windows_file(self):
- content = b"line1\r\nline2\r\nline3"
- with gfile.GFile(self._vocab_path, "wb") as writer:
- writer.write(content)
-
- actual = table_utils.get_vocabulary_from_file(self._vocab_path)
- self.assertAllEqual(["line1", "line2", "line3"], actual)
-
-if __name__ == "__main__":
- test.main()
diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py
deleted file mode 100644
index 4aac6c2..0000000
--- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py
+++ /dev/null
@@ -1,572 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras text vectorization preprocessing layer."""
-# pylint: disable=g-classes-have-attributes
-
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.keras import backend
-from tensorflow.python.keras.engine import base_preprocessing_layer
-from tensorflow.python.keras.layers.preprocessing import index_lookup
-from tensorflow.python.keras.layers.preprocessing import string_lookup
-from tensorflow.python.keras.utils import layer_utils
-from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_string_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops.ragged import ragged_functional_ops
-from tensorflow.python.ops.ragged import ragged_string_ops
-from tensorflow.python.util.tf_export import keras_export
-
-LOWER_AND_STRIP_PUNCTUATION = "lower_and_strip_punctuation"
-
-SPLIT_ON_WHITESPACE = "whitespace"
-
-TF_IDF = index_lookup.TF_IDF
-INT = index_lookup.INT
-MULTI_HOT = index_lookup.MULTI_HOT
-COUNT = index_lookup.COUNT
-
-# This is an explicit regex of all the characters that will be stripped if
-# LOWER_AND_STRIP_PUNCTUATION is set. If an application requires other
-# stripping, a Callable should be passed into the 'standardize' arg.
-DEFAULT_STRIP_REGEX = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']'
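-# For instance, standardizing "Hello, World!" with LOWER_AND_STRIP_PUNCTUATION
-# first lowercases the string and then applies this regex, yielding
-# "hello world".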
-
-# The string tokens in the extracted vocabulary
-_VOCAB_NAME = "vocab"
-# The inverse-document-frequency weights
-_IDF_NAME = "idf"
-# The IDF data for the OOV token
-_OOV_IDF_NAME = "oov_idf"
-
-# The string tokens in the full vocabulary
-_ACCUMULATOR_VOCAB_NAME = "vocab"
-# The total counts of each token in the vocabulary
-_ACCUMULATOR_COUNTS_NAME = "counts"
-# The number of documents / examples that each token appears in.
-_ACCUMULATOR_DOCUMENT_COUNTS = "document_counts"
-# The total number of documents / examples in the dataset.
-_ACCUMULATOR_NUM_DOCUMENTS = "num_documents"
-
-
-@keras_export(
- "keras.layers.experimental.preprocessing.TextVectorization", v1=[])
-class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer):
- """Text vectorization layer.
-
- This layer has basic options for managing text in a Keras model. It
- transforms a batch of strings (one example = one string) into either a list of
- token indices (one example = 1D tensor of integer token indices) or a dense
- representation (one example = 1D tensor of float values representing data
- about the example's tokens).
-
- If desired, the user can call this layer's adapt() method on a dataset.
- When this layer is adapted, it will analyze the dataset, determine the
- frequency of individual string values, and create a 'vocabulary' from them.
- This vocabulary can have unlimited size or be capped, depending on the
- configuration options for this layer; if there are more unique values in the
- input than the maximum vocabulary size, the most frequent terms will be used
- to create the vocabulary.
-
- The processing of each example contains the following steps:
-
- 1. standardize each example (usually lowercasing + punctuation stripping)
- 2. split each example into substrings (usually words)
- 3. recombine substrings into tokens (usually ngrams)
- 4. index tokens (associate a unique int value with each token)
- 5. transform each example using this index, either into a vector of ints or
- a dense float vector.
-
- Some notes on passing Callables to customize splitting and normalization for
- this layer:
-
- 1. Any callable can be passed to this Layer, but if you want to serialize
- this object you should only pass functions that are registered Keras
- serializables (see `tf.keras.utils.register_keras_serializable` for more
- details).
- 2. When using a custom callable for `standardize`, the data received
- by the callable will be exactly as passed to this layer. The callable
- should return a tensor of the same shape as the input.
- 3. When using a custom callable for `split`, the data received by the
- callable will have the 1st dimension squeezed out - instead of
- `[["string to split"], ["another string to split"]]`, the Callable will
- see `["string to split", "another string to split"]`. The callable should
- return a Tensor with the first dimension containing the split tokens -
- in this example, we should see something like `[["string", "to",
- "split"], ["another", "string", "to", "split"]]`. This makes the callable
- site natively compatible with `tf.strings.split()`.
-
- Args:
- max_tokens: The maximum size of the vocabulary for this layer. If None,
- there is no cap on the size of the vocabulary. Note that this vocabulary
- contains 1 OOV token, so the effective number of tokens is `(max_tokens -
- 1 - (1 if output == `"int"` else 0))`.
- standardize: Optional specification for standardization to apply to the
- input text. Values can be None (no standardization),
- `"lower_and_strip_punctuation"` (lowercase and remove punctuation) or a
- Callable. Default is `"lower_and_strip_punctuation"`.
- split: Optional specification for splitting the input text. Values can be
- None (no splitting), `"whitespace"` (split on ASCII whitespace), or a
- Callable. The default is `"whitespace"`.
- ngrams: Optional specification for ngrams to create from the possibly-split
- input text. Values can be None, an integer or tuple of integers; passing
- an integer will create ngrams up to that integer, and passing a tuple of
- integers will create ngrams for the specified values in the tuple. Passing
- None means that no ngrams will be created.
- output_mode: Optional specification for the output of the layer. Values can
- be `"int"`, `"multi_hot"`, `"count"` or `"tf_idf"`, configuring the layer
- as follows:
- - `"int"`: Outputs integer indices, one integer index per split string
- token. When output == `"int"`, 0 is reserved for masked locations;
- this reduces the vocab size to max_tokens-2 instead of max_tokens-1
- - `"multi_hot"`: Outputs a single int array per batch, of either
- vocab_size or max_tokens size, containing 1s in all elements where the
- token mapped to that index exists at least once in the batch item.
- - `"count"`: As `"multi_hot"`, but the int array contains a count of the
- number of times the token at that index appeared in the batch item.
- - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied to
- find the value in each token slot.
- output_sequence_length: Only valid in INT mode. If set, the output will have
- its time dimension padded or truncated to exactly `output_sequence_length`
- values, resulting in a tensor of shape [batch_size,
- output_sequence_length] regardless of how many tokens resulted from the
- splitting step. Defaults to None.
- pad_to_max_tokens: Only valid in `"multi_hot"`, `"count"`, and `"tf_idf"`
- modes. If True, the output will have its feature axis padded to
- `max_tokens` even if the number of unique tokens in the vocabulary is less
- than max_tokens, resulting in a tensor of shape [batch_size, max_tokens]
- regardless of vocabulary size. Defaults to False.
- vocabulary: An optional list of vocabulary terms, or a path to a text file
- containing a vocabulary to load into this layer. The file should contain
- one token per line. If the list or file contains the same token multiple
- times, an error will be thrown.
-
- Example:
-
- This example instantiates a TextVectorization layer that lowercases text,
- splits on whitespace, strips punctuation, and outputs integer vocab indices.
-
- >>> text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
- >>> max_features = 5000 # Maximum vocab size.
- >>> max_len = 4 # Sequence length to pad the outputs to.
- >>>
- >>> # Create the layer.
- >>> vectorize_layer = TextVectorization(
- ... max_tokens=max_features,
- ... output_mode='int',
- ... output_sequence_length=max_len)
- >>>
- >>> # Now that the vocab layer has been created, call `adapt` on the text-only
- >>> # dataset to create the vocabulary. You don't have to batch, but for large
- >>> # datasets batching means we're not keeping spare copies of the dataset.
- >>> vectorize_layer.adapt(text_dataset.batch(64))
- >>>
- >>> # Create the model that uses the vectorize text layer
- >>> model = tf.keras.models.Sequential()
- >>>
- >>> # Start by creating an explicit input layer. It needs to have a shape of
- >>> # (1,) (because we need to guarantee that there is exactly one string
- >>> # input per batch), and the dtype needs to be 'string'.
- >>> model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
- >>>
- >>> # The first layer in our model is the vectorization layer. After this
- >>> # layer, we have a tensor of shape (batch_size, max_len) containing vocab
- >>> # indices.
- >>> model.add(vectorize_layer)
- >>>
- >>> # Now, the model can map strings to integers, and you can add an embedding
- >>> # layer to map these integers to learned embeddings.
- >>> input_data = [["foo qux bar"], ["qux baz"]]
- >>> model.predict(input_data)
- array([[2, 1, 4, 0],
- [1, 3, 0, 0]])
-
- Example:
-
- This example instantiates a TextVectorization layer by passing a list
- of vocabulary terms to the layer's __init__ method.
-
- >>> vocab_data = ["earth", "wind", "and", "fire"]
- >>> max_len = 4 # Sequence length to pad the outputs to.
- >>>
- >>> # Create the layer, passing the vocab directly. You can also pass the
- >>> # vocabulary arg a path to a file containing one vocabulary word per
- >>> # line.
- >>> vectorize_layer = TextVectorization(
- ... max_tokens=max_features,
- ... output_mode='int',
- ... output_sequence_length=max_len,
- ... vocabulary=vocab_data)
- >>>
- >>> # Because we've passed the vocabulary directly, we don't need to adapt
- >>> # the layer - the vocabulary is already set. The vocabulary contains the
- >>> # padding token ('') and OOV token ('[UNK]') as well as the passed tokens.
- >>> vectorize_layer.get_vocabulary()
- ['', '[UNK]', 'earth', 'wind', 'and', 'fire']
-
- """
- # TODO(momernick): Add an examples section to the docstring.
-
- def __init__(self,
- max_tokens=None,
- standardize=LOWER_AND_STRIP_PUNCTUATION,
- split=SPLIT_ON_WHITESPACE,
- ngrams=None,
- output_mode=INT,
- output_sequence_length=None,
- pad_to_max_tokens=False,
- vocabulary=None,
- **kwargs):
-
- # This layer only applies to string processing, and so should only have
- # a dtype of 'string'.
- if "dtype" in kwargs and kwargs["dtype"] != dtypes.string:
- raise ValueError("TextVectorization may only have a dtype of string.")
- elif "dtype" not in kwargs:
- kwargs["dtype"] = dtypes.string
-
- # 'standardize' must be one of (None, LOWER_AND_STRIP_PUNCTUATION, callable)
- layer_utils.validate_string_arg(
- standardize,
- allowable_strings=(LOWER_AND_STRIP_PUNCTUATION),
- layer_name="TextVectorization",
- arg_name="standardize",
- allow_none=True,
- allow_callables=True)
-
- # 'split' must be one of (None, SPLIT_ON_WHITESPACE, callable)
- layer_utils.validate_string_arg(
- split,
- allowable_strings=(SPLIT_ON_WHITESPACE),
- layer_name="TextVectorization",
- arg_name="split",
- allow_none=True,
- allow_callables=True)
-
- # Support deprecated names for output_modes.
- if output_mode == "binary":
- output_mode = MULTI_HOT
- if output_mode == "tf-idf":
- output_mode = TF_IDF
- # 'output_mode' must be one of (None, INT, COUNT, MULTI_HOT, TF_IDF)
- layer_utils.validate_string_arg(
- output_mode,
- allowable_strings=(INT, COUNT, MULTI_HOT, TF_IDF),
- layer_name="TextVectorization",
- arg_name="output_mode",
- allow_none=True)
-
- # 'ngrams' must be one of (None, int, tuple(int))
- if not (ngrams is None or
- isinstance(ngrams, int) or
- isinstance(ngrams, tuple) and
- all(isinstance(item, int) for item in ngrams)):
- raise ValueError(("`ngrams` must be None, an integer, or a tuple of "
- "integers. Got %s") % (ngrams,))
-
- # 'output_sequence_length' must be one of (None, int) and is only
- # set if output_mode is INT.
- if (output_mode == INT and not (isinstance(output_sequence_length, int) or
- (output_sequence_length is None))):
- raise ValueError("`output_sequence_length` must be either None or an "
- "integer when `output_mode` is 'int'. "
- "Got %s" % output_sequence_length)
-
- if output_mode != INT and output_sequence_length is not None:
- raise ValueError("`output_sequence_length` must not be set if "
- "`output_mode` is not 'int'.")
-
- self._max_tokens = max_tokens
- self._standardize = standardize
- self._split = split
- self._ngrams_arg = ngrams
- if isinstance(ngrams, int):
- self._ngrams = tuple(range(1, ngrams + 1))
- else:
- self._ngrams = ngrams
-
- self._output_mode = output_mode
- self._output_sequence_length = output_sequence_length
- vocabulary_size = 0
- # IndexLookup needs to keep track of the current vocab size outside of its
- # layer weights. We persist it as a hidden part of the config during
- # serialization.
- if "vocabulary_size" in kwargs:
- vocabulary_size = kwargs["vocabulary_size"]
- del kwargs["vocabulary_size"]
-
- super(TextVectorization, self).__init__(
- combiner=None,
- **kwargs)
-
- self._index_lookup_layer = string_lookup.StringLookup(
- max_tokens=max_tokens,
- vocabulary=vocabulary,
- pad_to_max_tokens=pad_to_max_tokens,
- mask_token="",
- output_mode=output_mode if output_mode is not None else INT,
- vocabulary_size=vocabulary_size)
-
- def _assert_same_type(self, expected_type, values, value_name):
- if dtypes.as_dtype(expected_type) != dtypes.as_dtype(values.dtype):
- raise RuntimeError("Expected %s type %s, got %s" %
- (value_name, expected_type, values.dtype))
-
- def compute_output_shape(self, input_shape):
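- # Shape sketch: in "int" mode with splitting enabled, an input of shape
- # (batch, 1) maps to (batch, output_sequence_length); in the non-"int" output
- # modes the result is always (batch, max_tokens).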
- if self._output_mode != INT:
- return tensor_shape.TensorShape([input_shape[0], self._max_tokens])
-
- if self._output_mode == INT and self._split is None:
- if len(input_shape) <= 1:
- input_shape = tuple(input_shape) + (1,)
- return tensor_shape.TensorShape(input_shape)
-
- if self._output_mode == INT and self._split is not None:
- input_shape = list(input_shape)
- if len(input_shape) <= 1:
- input_shape = input_shape + [self._output_sequence_length]
- else:
- input_shape[1] = self._output_sequence_length
- return tensor_shape.TensorShape(input_shape)
-
- def compute_output_signature(self, input_spec):
- output_shape = self.compute_output_shape(input_spec.shape.as_list())
- output_dtype = (dtypes.int64 if self._output_mode == INT
- else backend.floatx())
- return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype)
-
- def adapt(self, data, reset_state=True):
- """Fits the state of the preprocessing layer to the dataset.
-
- Overrides the default adapt method to apply relevant preprocessing to the
- inputs before passing to the combiner.
-
- Args:
- data: The data to train on. It can be passed either as a tf.data Dataset,
- as a NumPy array, a string tensor, or as a list of texts.
- reset_state: Optional argument specifying whether to clear the state of
- the layer at the start of the call to `adapt`. This must be True for
- this layer, which does not support repeated calls to `adapt`.
- """
- if not reset_state:
- raise ValueError("TextVectorization does not support streaming adapts.")
-
- # Build the layer explicitly with the original data shape instead of relying
- # on an implicit call to `build` in the base layer's `adapt`, since
- # preprocessing changes the input shape.
- if isinstance(data, (list, tuple, np.ndarray)):
- data = ops.convert_to_tensor_v2_with_dispatch(data)
-
- if isinstance(data, ops.Tensor):
- if data.shape.rank == 1:
- data = array_ops.expand_dims(data, axis=-1)
- self.build(data.shape)
- preprocessed_inputs = self._preprocess(data)
- elif isinstance(data, dataset_ops.DatasetV2):
- # TODO(momernick): Replace this with a more V2-friendly API.
- shape = dataset_ops.get_legacy_output_shapes(data)
- if not isinstance(shape, tensor_shape.TensorShape):
- raise ValueError("The dataset passed to 'adapt' must contain a single "
- "tensor value.")
- if shape.rank == 0:
- data = data.map(lambda tensor: array_ops.expand_dims(tensor, 0))
- shape = dataset_ops.get_legacy_output_shapes(data)
- if shape.rank == 1:
- data = data.map(lambda tensor: array_ops.expand_dims(tensor, -1))
- self.build(dataset_ops.get_legacy_output_shapes(data))
- preprocessed_inputs = data.map(self._preprocess)
- else:
- raise ValueError(
- "adapt() requires a Dataset or an array as input, got {}".format(
- type(data)))
-
- self._index_lookup_layer.adapt(preprocessed_inputs)
-
- def get_vocabulary(self, include_special_tokens=True):
- """Returns the current vocabulary of the layer.
-
- Args:
- include_special_tokens: If True, the returned vocabulary will include
- the padding and OOV tokens, and a term's index in the vocabulary will
- equal the term's index when calling the layer. If False, the returned
- vocabulary will not include any padding or OOV tokens.
- """
- return self._index_lookup_layer.get_vocabulary(include_special_tokens)
-
- def vocabulary_size(self):
- """Gets the current size of the layer's vocabulary.
-
- Returns:
- The integer size of the vocabulary, including optional mask and oov indices.
- """
- return self._index_lookup_layer.vocabulary_size()
-
- def get_config(self):
- # This does not include the 'vocabulary' arg, since if the vocab was passed
- # at init time it's now stored in variable state - we don't need to
- # pull it off disk again.
- config = {
- "max_tokens": self._index_lookup_layer.max_tokens,
- "standardize": self._standardize,
- "split": self._split,
- "ngrams": self._ngrams_arg,
- "output_mode": self._output_mode,
- "output_sequence_length": self._output_sequence_length,
- "pad_to_max_tokens": self._index_lookup_layer.pad_to_max_tokens,
- "vocabulary_size": self._index_lookup_layer.vocabulary_size(),
- }
- base_config = super(TextVectorization, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
-
- def count_params(self):
- # This method counts the number of scalars in the weights of this layer.
- # Since this layer doesn't have any /actual/ weights (in that there's
- # nothing in this layer that can be trained - we only use the weight
- # abstraction for ease of saving!) we return 0.
- return 0
-
- def set_vocabulary(self, vocabulary, idf_weights=None):
- """Sets vocabulary (and optionally document frequency) data for this layer.
-
- This method sets the vocabulary and idf weights for this layer directly,
- instead of analyzing a dataset through 'adapt'. It should be used whenever
- the vocab (and optionally document frequency) information is already known.
- If vocabulary data is already present in the layer, this method will replace
- it.
-
- Args:
- vocabulary: An array of string tokens, or a path to a file containing one
- token per line.
- idf_weights: An array of document frequency data with equal length to
- vocab. Only necessary if the layer output_mode is TF_IDF.
-
- Raises:
- ValueError: If there are too many inputs, the inputs do not match, or
- input data is missing.
- RuntimeError: If the vocabulary cannot be set when this function is
- called. This happens when the layer is in `"multi_hot"`, `"count"`, or
- `"tf_idf"` mode, `pad_to_max_tokens` is False, and the layer itself has
- already been called.
- """
- self._index_lookup_layer.set_vocabulary(vocabulary, idf_weights=idf_weights)
-
- def build(self, input_shape):
- # We have to use 'and not ==' here, because input_shape[1] !/== 1 can result
- # in None for undefined shape axes. If using 'and !=', this causes the
- # expression to evaluate to False instead of True if the shape is undefined;
- # the expression needs to evaluate to True in that case.
- if self._split is not None:
- if input_shape.ndims > 1 and not input_shape[-1] == 1: # pylint: disable=g-comparison-negation
- raise RuntimeError(
- "When using TextVectorization to tokenize strings, the innermost "
- "dimension of the input array must be 1, got shape "
- "{}".format(input_shape))
-
- super(TextVectorization, self).build(input_shape)
-
- def _set_state_variables(self, updates):
- if not self.built:
- raise RuntimeError("_set_state_variables() must be called after build().")
- if self._output_mode == TF_IDF:
- self.set_vocabulary(updates[_VOCAB_NAME], idf_weights=updates[_IDF_NAME])
- else:
- self.set_vocabulary(updates[_VOCAB_NAME])
-
- def _preprocess(self, inputs):
- if self._standardize == LOWER_AND_STRIP_PUNCTUATION:
- if tf_utils.is_ragged(inputs):
- lowercase_inputs = ragged_functional_ops.map_flat_values(
- gen_string_ops.string_lower, inputs)
- # Depending on configuration, we may never touch the non-data tensor
- # in the ragged inputs tensor. If that is the case, and this is the
- # only layer in the keras model, running it will throw an error.
- # To get around this, we wrap the result in an identity.
- lowercase_inputs = array_ops.identity(lowercase_inputs)
- else:
- lowercase_inputs = gen_string_ops.string_lower(inputs)
- inputs = string_ops.regex_replace(lowercase_inputs, DEFAULT_STRIP_REGEX,
- "")
- elif callable(self._standardize):
- inputs = self._standardize(inputs)
- elif self._standardize is not None:
- raise ValueError(("%s is not a supported standardization. "
- "TextVectorization supports the following options "
- "for `standardize`: None, "
- "'lower_and_strip_punctuation', or a "
- "Callable.") % self._standardize)
-
- if self._split is not None:
- # If we are splitting, we squeeze out the innermost axis (which build()
- # validates to be of dimension 1). We do this here instead of after splitting for
- # performance reasons - it's more expensive to squeeze a ragged tensor.
- if inputs.shape.ndims > 1:
- inputs = array_ops.squeeze(inputs, axis=-1)
- if self._split == SPLIT_ON_WHITESPACE:
- # This treats multiple whitespaces as one whitespace, and strips leading
- # and trailing whitespace.
- inputs = ragged_string_ops.string_split_v2(inputs)
- elif callable(self._split):
- inputs = self._split(inputs)
- else:
- raise ValueError(
- ("%s is not a supported splitting."
- "TextVectorization supports the following options "
- "for `split`: None, 'whitespace', or a Callable.") % self._split)
-
- # Note that 'inputs' here can be either ragged or dense depending on the
- # configuration choices for this Layer. The strings.ngrams op, however, does
- # support both ragged and dense inputs.
- if self._ngrams is not None:
- inputs = ragged_string_ops.ngrams(
- inputs, ngram_width=self._ngrams, separator=" ")
-
- return inputs
-
- def call(self, inputs):
- if isinstance(inputs, (list, tuple, np.ndarray)):
- inputs = ops.convert_to_tensor_v2_with_dispatch(inputs)
-
- inputs = self._preprocess(inputs)
-
- # If we're not doing any output processing, return right away.
- if self._output_mode is None:
- return inputs
-
- lookup_data = self._index_lookup_layer(inputs)
- if self._output_mode == INT:
-
- # Maybe trim the output (NOOP if self._output_sequence_length is None).
- output_tensor = lookup_data[..., :self._output_sequence_length]
-
- output_shape = output_tensor.shape.as_list()
- output_shape[-1] = self._output_sequence_length
-
- # If it is a ragged tensor, convert it to dense with correct shape.
- if tf_utils.is_ragged(output_tensor):
- return output_tensor.to_tensor(default_value=0, shape=output_shape)
-
- if self._output_sequence_length is None:
- return output_tensor
-
- padding, _ = array_ops.required_space_to_batch_paddings(
- output_tensor.shape, output_shape)
- return array_ops.pad(output_tensor, padding)
-
- return lookup_data
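For reference, a minimal sketch (not part of this change) of the trim/pad behavior implemented in the removed call() above, exercised through the public tf.keras.layers.TextVectorization API; assumes TensorFlow 2.x with the Keras-package version of the layer.

import numpy as np
import tensorflow as tf

# With output_mode="int", shorter rows are right-padded with 0 and longer rows
# are trimmed to output_sequence_length, mirroring the removed call() logic.
layer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
    output_sequence_length=6)
layer.set_vocabulary(["earth", "wind", "and", "fire"])  # 0 is padding, 1 is [UNK]

out = layer(np.array([["Earth wind and fire!"],
                      ["fire and earth michigan ohio lake huron erie"]]))
print(out.numpy())  # [[2 3 4 5 0 0]
                    #  [5 4 2 1 1 1]]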
diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_distribution_test.py
deleted file mode 100644
index c71d3c5..0000000
--- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_distribution_test.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Distribution tests for keras.layers.preprocessing.text_vectorization."""
-
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import combinations as ds_combinations
-from tensorflow.python.distribute import multi_process_runner
-from tensorflow.python.framework import config
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_combinations as combinations
-from tensorflow.python.keras import backend
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.distribute import strategy_combinations
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.keras.layers.preprocessing import text_vectorization
-from tensorflow.python.platform import test
-
-
-@ds_combinations.generate(
- combinations.combine(
- strategy=strategy_combinations.all_strategies +
- strategy_combinations.multi_worker_mirrored_strategies,
- mode=["eager"]))
-class TextVectorizationDistributionTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_distribution_strategy_output(self, strategy):
- # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
- if backend.is_tpu_strategy(strategy):
- self.skipTest("This test needs MLIR bridge on TPU.")
-
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- input_dataset = dataset_ops.Dataset.from_tensor_slices(input_array).batch(
- 2, drop_remainder=True)
-
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- config.set_soft_device_placement(True)
-
- with strategy.scope():
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- output_dataset = model.predict(input_dataset)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_distribution_strategy_output_with_adapt(self, strategy):
- # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
- if backend.is_tpu_strategy(strategy):
- self.skipTest("This test needs MLIR bridge on TPU.")
- if test.is_built_with_rocm():
- self.skipTest("MultiworkerMirroredGPU2x fails with ROCm")
- vocab_data = [[
- "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
- "and", "fire"
- ]]
- vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- input_dataset = dataset_ops.Dataset.from_tensor_slices(input_array).batch(
- 2, drop_remainder=True)
-
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- config.set_soft_device_placement(True)
-
- with strategy.scope():
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT)
- layer.adapt(vocab_dataset)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- output_dataset = model.predict(input_dataset)
- self.assertAllEqual(expected_output, output_dataset)
-
-if __name__ == "__main__":
- v2_compat.enable_v2_behavior()
- multi_process_runner.test_main()
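The adapt-based path the deleted distribution test covered can still be sketched against the public API; a minimal, hedged example (TensorFlow 2.x assumed, no distribution strategy involved) showing adapt() ordering the vocabulary by descending token frequency:

import numpy as np
import tensorflow as tf

# adapt() builds the vocabulary from data, most frequent tokens first,
# so "earth" maps to 2, "wind" to 3, "and" to 4, "fire" to 5 (1 is [UNK]).
layer = tf.keras.layers.TextVectorization(
    standardize=None, split=None, output_mode="int")
layer.adapt(["earth"] * 4 + ["wind"] * 3 + ["and"] * 2 + ["fire"])
print(layer.get_vocabulary())  # ['', '[UNK]', 'earth', 'wind', 'and', 'fire']

out = layer(np.array([["earth", "wind", "and", "fire"],
                      ["fire", "and", "earth", "michigan"]]))
print(out.numpy())  # [[2 3 4 5]
                    #  [5 4 2 1]]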
diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py
deleted file mode 100644
index e985cf0..0000000
--- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py
+++ /dev/null
@@ -1,1768 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Keras text vectorization preprocessing layer."""
-
-import gc
-import os
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python import keras
-from tensorflow.python import tf2
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import one_device_strategy
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.keras import backend
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.layers import convolutional
-from tensorflow.python.keras.layers import core
-from tensorflow.python.keras.layers import embeddings
-from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils
-from tensorflow.python.keras.layers.preprocessing import text_vectorization
-from tensorflow.python.keras.utils import generic_utils
-from tensorflow.python.ops import gen_string_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.ops.ragged import ragged_string_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-
-
-def _get_end_to_end_test_cases():
- test_cases = (
- {
- "testcase_name":
- "test_simple_tokens_int_mode",
- # Create an array where 'earth' is the most frequent term, followed by
- # 'wind', then 'and', then 'fire'. This ensures that the vocab
- # is sorted by frequency.
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
- ["and"], ["earth"], ["michigan"]]),
- "kwargs": {
- "max_tokens": None,
- "standardize": None,
- "split": None,
- "output_mode": text_vectorization.INT
- },
- "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
- },
- {
- "testcase_name":
- "test_simple_tokens_int_mode_hard_cap",
- # Create an array where 'earth' is the most frequent term, followed by
- # 'wind', then 'and', then 'fire'. This ensures that the vocab
- # is sorted by frequency.
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
- ["and"], ["earth"], ["michigan"]]),
- "kwargs": {
- "max_tokens": 6,
- "standardize": None,
- "split": None,
- "output_mode": text_vectorization.INT
- },
- "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]],
- },
- {
- "testcase_name":
- "test_special_tokens_int_mode",
- # Mask tokens in the vocab data should be ignored, and mapped to 0 when
- # they appear in the input data.
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- [""], [""], [""], ["[UNK]"], ["[UNK]"], ["[UNK]"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], [""], ["wind"], ["[UNK]"], ["and"], [""],
- ["fire"], ["and"], ["[UNK]"], ["michigan"]]),
- "kwargs": {
- "max_tokens": None,
- "standardize": None,
- "split": None,
- "output_mode": text_vectorization.INT
- },
- "expected_output": [[2], [0], [3], [1], [4], [0], [5], [4], [1], [1]],
- },
- {
- "testcase_name":
- "test_documents_int_mode",
- "vocab_data":
- np.array([["fire earth earth"], ["earth earth"], ["wind wind"],
- ["and wind and"]]),
- "input_data":
- np.array([["earth wind and"], ["fire fire"], ["and earth"],
- ["michigan"]]),
- "kwargs": {
- "max_tokens": None,
- "standardize": None,
- "split": text_vectorization.SPLIT_ON_WHITESPACE,
- "output_mode": text_vectorization.INT
- },
- "expected_output": [[2, 3, 4], [5, 5, 0], [4, 2, 0], [1, 0, 0]],
- },
- {
- "testcase_name":
- "test_documents_1d_input_int_mode",
- "vocab_data":
- np.array([
- "fire earth earth", "earth earth", "wind wind", "and wind and"
- ]),
- "input_data":
- np.array([["earth wind and"], ["fire fire"], ["and earth"],
- ["michigan"]]),
- "kwargs": {
- "max_tokens": None,
- "standardize": None,
- "split": text_vectorization.SPLIT_ON_WHITESPACE,
- "output_mode": text_vectorization.INT
- },
- "expected_output": [[2, 3, 4], [5, 5, 0], [4, 2, 0], [1, 0, 0]],
- },
- {
- "testcase_name":
- "test_simple_tokens_binary_mode",
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
- ["and"], ["earth"], ["michigan"]]),
- "kwargs": {
- "max_tokens": 5,
- "standardize": None,
- "split": None,
- "output_mode": text_vectorization.MULTI_HOT
- },
- "expected_output": [[0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 1, 0],
- [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0],
- [0, 1, 0, 0, 0], [1, 0, 0, 0, 0]],
- },
- {
- "testcase_name":
- "test_documents_binary_mode",
- "vocab_data":
- np.array([["fire earth earth"], ["earth earth"], ["wind wind"],
- ["and wind and"]]),
- "input_data":
- np.array([["earth wind"], ["and"], ["fire fire"],
- ["earth michigan"]]),
- "kwargs": {
- "max_tokens": 5,
- "standardize": None,
- "split": text_vectorization.SPLIT_ON_WHITESPACE,
- "output_mode": text_vectorization.MULTI_HOT
- },
- "expected_output": [[0, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1],
- [1, 1, 0, 0, 0]],
- },
- {
- "testcase_name":
- "test_simple_tokens_count_mode",
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
- ["and"], ["earth"], ["michigan"]]),
- "kwargs": {
- "max_tokens": 5,
- "standardize": None,
- "split": None,
- "output_mode": text_vectorization.COUNT
- },
- "expected_output": [[0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 1, 0],
- [0, 0, 0, 0, 1], [0, 0, 0, 0, 1], [0, 0, 0, 1, 0],
- [0, 1, 0, 0, 0], [1, 0, 0, 0, 0]],
- },
- {
- "testcase_name":
- "test_documents_count_mode",
- "vocab_data":
- np.array([["fire earth earth"], ["earth earth"], ["wind wind"],
- ["and wind and"]]),
- "input_data":
- np.array([["earth wind"], ["and"], ["fire fire"],
- ["earth michigan"]]),
- "kwargs": {
- "max_tokens": 5,
- "standardize": None,
- "split": text_vectorization.SPLIT_ON_WHITESPACE,
- "output_mode": text_vectorization.COUNT
- },
- "expected_output": [[0, 1, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 2],
- [1, 1, 0, 0, 0]],
- },
- {
- "testcase_name":
- "test_tokens_idf_mode",
- "vocab_data":
- np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"],
- ["wind"], ["wind"], ["wind"], ["and"], ["and"]]),
- "input_data":
- np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"],
- ["and"], ["earth"], ["michigan"]]),
- "kwargs": {
- "max_tokens": 5,
- "standardize": None,
- "split": None,
- "output_mode": text_vectorization.TF_IDF
- },
- "expected_output": [[0, 1.098612, 0, 0, 0], [0, 0, 1.252763, 0, 0],
- [0, 0, 0, 1.466337, 0], [0, 0, 0, 0, 1.7917595],
- [0, 0, 0, 0, 1.7917595], [0, 0, 0, 1.4663371, 0],
- [0, 1.098612, 0, 0, 0], [1.402368, 0, 0, 0, 0]],
- },
- {
- "testcase_name":
- "test_documents_idf_mode",
- "vocab_data":
- np.array([["fire earth earth"], ["earth earth"], ["wind wind"],
- ["and wind and"]]),
- "input_data":
- np.array([["earth wind"], ["and"], ["fire fire"],
- ["earth michigan"]]),
- "kwargs": {
- "max_tokens": 5,
- "standardize": None,
- "split": text_vectorization.SPLIT_ON_WHITESPACE,
- "output_mode": text_vectorization.TF_IDF
- },
- "expected_output": [[0., 0.847298, 0.847298, 0., 0.],
- [0., 0., 0., 1.098612, 0.],
- [0., 0., 0., 0., 2.197225],
- [0.972955, 0.847298, 0., 0., 0.]],
- },
- )
-
- crossed_test_cases = []
- # Cross above test cases with use_dataset in (True, False)
- for use_dataset in (True, False):
- for case in test_cases:
- case = case.copy()
- if use_dataset:
- case["testcase_name"] = case["testcase_name"] + "_with_dataset"
- case["use_dataset"] = use_dataset
- crossed_test_cases.append(case)
-
- return crossed_test_cases
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class TextVectorizationLayerTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest
- ):
-
- @parameterized.named_parameters(*_get_end_to_end_test_cases())
- def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs,
- use_dataset, expected_output):
- cls = text_vectorization.TextVectorization
- if kwargs.get("output_mode") == text_vectorization.INT:
- expected_output_dtype = dtypes.int64
- else:
- expected_output_dtype = dtypes.float32
- input_shape = input_data.shape
-
- if use_dataset:
- # Keras APIs expect batched datasets.
- # TODO(rachelim): `model.predict` predicts the result on each
- # dataset batch separately, then tries to concatenate the results
- # together. When the results have different shapes on the non-concat
- # axis (which can happen in the output_mode = INT case for
- # TextVectorization), the concatenation fails. In real use cases, this may
- # not be an issue because users are likely to pipe the preprocessing layer
- # into other keras layers instead of predicting it directly. A workaround
- # for these unit tests is to have the dataset only contain one batch, so
- # no concatenation needs to happen with the result. For consistency with
- # numpy input, we should make `predict` join differently shaped results
- # together sensibly, with 0 padding.
- input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch(
- input_shape[0])
- vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch(
- input_shape[0])
-
- output_data = testing_utils.layer_test(
- cls,
- kwargs=kwargs,
- input_shape=input_shape,
- input_data=input_data,
- input_dtype=dtypes.string,
- expected_output_dtype=expected_output_dtype,
- validate_training=False,
- adapt_data=vocab_data)
- self.assertAllClose(expected_output, output_data)
-
- def test_scalar_input_int_mode_no_len_limit(self):
- vocab_data = [
- "fire earth earth", "earth earth", "wind wind", "and wind and"
- ]
- input_data = "earth wind and fire fire and earth michigan"
- layer = text_vectorization.TextVectorization()
- layer.adapt(vocab_data)
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
- layer.set_vocabulary(["earth", "wind", "and", "fire"])
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1])
-
- def test_scalar_input_int_mode_trim_to_len_limit(self):
- vocab_data = [
- "fire earth earth", "earth earth", "wind wind", "and wind and"
- ]
- input_data = "earth wind and fire fire and earth michigan"
- layer = text_vectorization.TextVectorization(output_sequence_length=3)
- layer.adapt(vocab_data)
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [2, 3, 4])
- layer.set_vocabulary(["earth", "wind", "and", "fire"])
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [2, 3, 4])
-
- def test_scalar_input_int_pad_to_len_limit(self):
- vocab_data = [
- "fire earth earth", "earth earth", "wind wind", "and wind and"
- ]
- input_data = "earth wind and fire fire and earth michigan"
- layer = text_vectorization.TextVectorization(output_sequence_length=10)
- layer.adapt(vocab_data)
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
- layer.set_vocabulary(["earth", "wind", "and", "fire"])
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [2, 3, 4, 5, 5, 4, 2, 1, 0, 0])
-
- def test_list_inputs_1d(self):
- vocab_data = ["two two two", "two three three", "three four four five"]
- input_data = ["two three", "four five"]
- layer = text_vectorization.TextVectorization()
- layer.adapt(vocab_data)
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
- layer.set_vocabulary(["two", "three", "four", "five"])
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
-
- def test_tensor_inputs(self):
- vocab_data = constant_op.constant(
- ["two two two", "two three three", "three four four five"])
- input_data = constant_op.constant(["two three", "four five"])
- layer = text_vectorization.TextVectorization()
- layer.adapt(vocab_data)
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
- layer.set_vocabulary(["two", "three", "four", "five"])
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
-
- def test_list_inputs_2d(self):
- vocab_data = [
- ["two two two"], ["two three three"], ["three four four five"]]
- input_data = [["two three"], ["four five"]]
- layer = text_vectorization.TextVectorization()
- layer.adapt(vocab_data)
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
- layer.set_vocabulary(["two", "three", "four", "five"])
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
-
- def test_dataset_of_single_strings(self):
- vocab_data = ["two two two", "two three three", "three four four five"]
- input_data = ["two three", "four five"]
- vocab_ds = dataset_ops.Dataset.from_tensor_slices(vocab_data) # unbatched
- layer = text_vectorization.TextVectorization()
- layer.adapt(vocab_ds)
- out = layer(input_data)
- if context.executing_eagerly():
- self.assertAllClose(out.numpy(), [[2, 3], [4, 5]])
-
- @parameterized.named_parameters(
- {
- "testcase_name": "1d",
- "data": ["0", "a", "b", "c", "d", "e", "a", "b", "c", "d", "f"],
- "expected": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1]
- },
- {
- "testcase_name": "2d",
- "data": [["0", "a", "b", "c", "d"], ["e", "a", "b", "c", "d"], ["f"]],
- "expected": [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 0, 0, 0, 0]]
- },
- {
- "testcase_name":
- "3d",
- "data": [[["0", "a", "b"], ["c", "d"]], [["e", "a"], ["b", "c", "d"]],
- [["f"]]],
- "expected": [[[1, 2, 3], [4, 5, 0]], [[1, 2, 0], [3, 4, 5]],
- [[1, 0, 0], [0, 0, 0]]]
- },
- )
- def test_layer_dimensionality_handling(self, data, expected):
- vocab = ["a", "b", "c", "d"]
- vectorization = text_vectorization.TextVectorization(
- max_tokens=None, standardize=None, split=None, pad_to_max_tokens=False)
- vectorization.set_vocabulary(vocab)
- output = vectorization(ragged_factory_ops.constant(data))
- self.assertAllEqual(expected, output)
-
- @parameterized.named_parameters(
- {
- "testcase_name": "1d",
- "data": ["0 a b c d e a b c d f"],
- "expected": [[1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1]]
- },
- {
- "testcase_name":
- "3d",
- "data": [[["0 a b"], ["c d"]], [["e a"], ["b c d"]], [["f"]]],
- "expected": [[[1, 2, 3], [4, 5, 0]], [[1, 2, 0], [3, 4, 5]],
- [[1, 0, 0], [0, 0, 0]]]
- },
- )
- def test_layer_dimensionality_handling_with_split(self, data, expected):
- vocab = ["a", "b", "c", "d"]
- vectorization = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- pad_to_max_tokens=False)
- vectorization.set_vocabulary(vocab)
- output = vectorization(ragged_factory_ops.constant(data, inner_shape=(1,)))
- self.assertAllEqual(expected, output)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class TextVectorizationPreprocessingTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def _write_to_temp_file(self, file_name, vocab_list):
- vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt")
- with gfile.GFile(vocab_path, "w") as writer:
- for vocab in vocab_list:
- writer.write(vocab + "\n")
- writer.flush()
- writer.close()
- return vocab_path
-
- def test_summary_before_adapt(self):
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=10,
- standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION,
- split=None,
- ngrams=None,
- output_mode=text_vectorization.TF_IDF)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- # We are testing that model.summary() can be called without erroring out.
- # (b/145726907)
- model.summary()
-
- def test_normalization(self):
- input_array = np.array([["Earth", "wInD", "aNd", "firE"],
- ["fire|", "an<>d", "{earth}", "michigan@%$"]])
- expected_output = np.array([[b"earth", b"wind", b"and", b"fire"],
- [b"fire", b"and", b"earth", b"michigan"]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION,
- split=None,
- ngrams=None,
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_normalization_ragged_inputs(self):
- input_array = ragged_factory_ops.constant([["Earth", "wInD", "aNd", "firE"],
- ["fire|", "an<>d", "{earth}"]])
- expected_output = [[b"earth", b"wind", b"and", b"fire"],
- [b"fire", b"and", b"earth"]]
-
- input_data = keras.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION,
- split=None,
- ngrams=None,
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_custom_normalization(self):
- input_array = np.array([["Earth", "wInD", "aNd", "firE"],
- ["fire|", "an<>d", "{earth}", "michigan@%$"]])
- expected_output = np.array(
- [[b"earth", b"wind", b"and", b"fire"],
- [b"fire|", b"an<>d", b"{earth}", b"michigan@%$"]])
-
- custom_standardization = gen_string_ops.string_lower
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=custom_standardization,
- split=None,
- ngrams=None,
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_string_splitting(self):
- input_array = np.array([["earth wind and fire"],
- ["\tfire\tand\nearth michigan "]])
- expected_output = [[b"earth", b"wind", b"and", b"fire"],
- [b"fire", b"and", b"earth", b"michigan"]]
-
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- ngrams=None,
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_custom_string_splitting(self):
- input_array = np.array([["earth>wind>and fire"],
- ["\tfire>and\nearth>michigan"]])
- expected_output = [[b"earth", b"wind", b"and fire"],
- [b"\tfire", b"and\nearth", b"michigan"]]
-
- custom_split = lambda x: ragged_string_ops.string_split_v2(x, sep=">")
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=custom_split,
- ngrams=None,
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_single_ngram_value_ragged_inputs(self):
- input_array = ragged_factory_ops.constant([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth"]])
- # pyformat: disable
- expected_output = [[b"earth", b"wind", b"and", b"fire",
- b"earth wind", b"wind and", b"and fire",
- b"earth wind and", b"wind and fire"],
- [b"fire", b"and", b"earth",
- b"fire and", b"and earth",
- b"fire and earth"]]
- # pyformat: enable
-
- input_data = keras.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- ngrams=3,
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_single_ngram_value(self):
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- # pyformat: disable
- expected_output = [[b"earth", b"wind", b"and", b"fire",
- b"earth wind", b"wind and", b"and fire",
- b"earth wind and", b"wind and fire"],
- [b"fire", b"and", b"earth", b"michigan",
- b"fire and", b"and earth", b"earth michigan",
- b"fire and earth", b"and earth michigan"]]
- # pyformat: enable
-
- input_data = keras.Input(shape=(4,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- ngrams=3,
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_multiple_ngram_values(self):
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- # pyformat: disable
- expected_output = [[b"earth wind", b"wind and", b"and fire",
- b"earth wind and", b"wind and fire"],
- [b"fire and", b"and earth", b"earth michigan",
- b"fire and earth", b"and earth michigan"]]
- # pyformat: enable
-
- input_data = keras.Input(shape=(4,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- ngrams=(2, 3),
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_string_multiple_preprocessing_steps(self):
- input_array = np.array([["earth wInD and firE"],
- ["\tfire\tand\nearth!! michig@n "]])
- expected_output = [[
- b"earth",
- b"wind",
- b"and",
- b"fire",
- b"earth wind",
- b"wind and",
- b"and fire",
- ],
- [
- b"fire",
- b"and",
- b"earth",
- b"michign",
- b"fire and",
- b"and earth",
- b"earth michign",
- ]]
-
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=text_vectorization.LOWER_AND_STRIP_PUNCTUATION,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- ngrams=2,
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_string_splitting_with_non_1d_array_fails(self):
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- output_mode=None)
- with self.assertRaisesRegex(RuntimeError,
- ".*tokenize strings, the innermost dime.*"):
- _ = layer(input_data)
-
- def test_string_splitting_with_non_1d_raggedarray_fails(self):
- input_data = keras.Input(shape=(None,), ragged=True, dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- vocabulary=["a"],
- max_tokens=None,
- standardize=None,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- output_mode=None)
- with self.assertRaisesRegex(RuntimeError,
- ".*tokenize strings, the innermost dime.*"):
- _ = layer(input_data)
-
- def test_standardization_with_invalid_standardize_arg(self):
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(vocabulary=["a"])
- layer._standardize = "unsupported"
- with self.assertRaisesRegex(ValueError,
- ".*is not a supported standardization.*"):
- _ = layer(input_data)
-
- def test_splitting_with_invalid_split_arg(self):
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(vocabulary=["a"])
- layer._split = "unsupported"
- with self.assertRaisesRegex(ValueError, ".*is not a supported splitting.*"):
- _ = layer(input_data)
-
- def test_vocab_setting_via_init(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT,
- vocabulary=vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_vocab_setting_via_init_file(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT,
- vocabulary=vocab_path)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_vocab_setting_via_setter(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT)
- layer.set_vocabulary(vocab_path)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_vocab_setting_with_oov_via_setter(self):
- vocab_data = ["", "[UNK]", "earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- vocab_path = self._write_to_temp_file("vocab_file", vocab_data)
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT)
- layer.set_vocabulary(vocab_path)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class TextVectorizationDistributionTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_distribution_strategy_output(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- strategy = one_device_strategy.OneDeviceStrategy("/cpu:0")
- with strategy.scope():
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class TextVectorizationOutputTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_int_output(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_densifies_with_zeros(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- # Create an input array that has 5 elements in the first example and 4 in
- # the second. This should output a 2x5 tensor with a padding value in the
- # second example.
- input_array = np.array([["earth wind and also fire"],
- ["fire and earth michigan"]])
- expected_output = [[2, 3, 4, 1, 5], [5, 4, 2, 1, 0]]
-
- # This test doesn't explicitly set an output shape, so the 2nd dimension
- # should stay 'None'.
- expected_output_shape = [None, None]
-
- # The input shape here is explicitly 1 because we're tokenizing.
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- output_mode=text_vectorization.INT)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_densifies_with_zeros_and_pads(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- # Create an input array that has 5 elements in the first example and 4 in
- # the second. This should output a 2x6 tensor with a padding value in the
- # second example, since output_sequence_length is set to 6.
- input_array = np.array([["earth wind and also fire"],
- ["fire and earth michigan"]])
- expected_output = [[2, 3, 4, 1, 5, 0], [5, 4, 2, 1, 0, 0]]
-
- output_sequence_length = 6
- expected_output_shape = [None, output_sequence_length]
-
- # The input shape here is explicitly 1 because we're tokenizing.
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- output_mode=text_vectorization.INT,
- output_sequence_length=output_sequence_length)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_densifies_with_zeros_and_strips(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- # Create an input array that has 5 elements in the first example and 4 in
- # the second. This should output a 2x3 tensor with a padding value in the
- # second example, since output_sequence_length is set to 3.
- input_array = np.array([["earth wind and also fire"],
- ["fire and earth michigan"]])
- expected_output = [[2, 3, 4], [5, 4, 2]]
- output_sequence_length = 3
- expected_output_shape = [None, output_sequence_length]
-
- # The input shape here is explicitly 1 because we're tokenizing.
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- output_mode=text_vectorization.INT,
- output_sequence_length=output_sequence_length)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_int_output_dynamically_strips_and_pads(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- # Create an input array that has 5 elements in the first example and 4 in
- # the second. This should output a 2x3 tensor with a padding value in the
- # second example, since output_sequence_length is set to 3.
- input_array = np.array([["earth wind and also fire"],
- ["fire and earth michigan"]])
- expected_output = [[2, 3, 4], [5, 4, 2]]
- output_sequence_length = 3
- expected_output_shape = [None, output_sequence_length]
-
- # The input shape here is explicitly 1 because we're tokenizing.
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- output_mode=text_vectorization.INT,
- output_sequence_length=output_sequence_length)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- # Create an input array that has 1 element in the first example and 2 in
- # the second. This should output a 2x3 tensor with a padding value in the
- # second example, since output_sequence_length is set to 3.
- input_array_2 = np.array([["wind"], ["fire and"]])
- expected_output_2 = [[3, 0, 0], [5, 4, 0]]
- output_dataset = model.predict(input_array_2)
- self.assertAllEqual(expected_output_2, output_dataset)
-
- def test_binary_output_hard_maximum(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- # pyformat: disable
- expected_output = [[0, 1, 1, 1, 0, 0],
- [1, 1, 0, 1, 0, 0]]
- # pyformat: enable
- max_tokens = 6
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=max_tokens,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT,
- pad_to_max_tokens=True)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_binary_output_soft_maximum(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- # pyformat: disable
- expected_output = [[0, 1, 1, 1, 0],
- [1, 1, 0, 1, 0]]
- # pyformat: enable
- max_tokens = 5
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=10,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT,
- pad_to_max_tokens=False)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_bag_output_hard_maximum_set_vocabulary_after_build(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- # pyformat: disable
- expected_output = [[0, 1, 1, 1, 0],
- [1, 1, 0, 1, 0]]
- # pyformat: enable
- max_tokens = 5
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=max_tokens,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT,
- pad_to_max_tokens=True)
- int_data = layer(input_data)
- layer.set_vocabulary(vocab_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_bag_output_hard_maximum_adapt_after_build(self):
- vocab_data = np.array([
- "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
- "and", "fire"
- ])
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- # pyformat: disable
- expected_output = [[0, 1, 1, 1, 0],
- [1, 1, 0, 1, 0]]
- # pyformat: enable
- max_tokens = 5
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=max_tokens,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT,
- pad_to_max_tokens=True)
- int_data = layer(input_data)
- layer.adapt(vocab_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_bag_output_hard_maximum_set_state_variables_after_build(self):
- state_variables = {
- text_vectorization._VOCAB_NAME: ["earth", "wind", "and", "fire"]
- }
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- # pyformat: disable
- expected_output = [[0, 1, 1, 1, 0],
- [1, 1, 0, 1, 0]]
- # pyformat: enable
- max_tokens = 5
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=max_tokens,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT,
- pad_to_max_tokens=True)
- int_data = layer(input_data)
- layer._set_state_variables(state_variables)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_bag_output_hard_maximum_multiple_adapts(self):
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
- adapt_data = ["earth", "earth", "earth", "earth", "wind", "wind", "wind"]
- first_expected_output = [
- [1, 1, 1, 0, 0],
- [1, 1, 0, 0, 0],
- ]
- second_adapt_data = [
- "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
- "and", "fire"
- ]
- second_expected_output = [
- [0, 1, 1, 1, 0],
- [1, 1, 0, 1, 0],
- ]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=5,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT,
- pad_to_max_tokens=True)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- # Test the first adapt
- layer.adapt(adapt_data)
- first_output = model.predict(input_array)
- # Test the second adapt
- layer.adapt(second_adapt_data)
- second_output = model.predict(input_array)
- self.assertAllEqual(first_expected_output, first_output)
- self.assertAllEqual(second_expected_output, second_output)
-
- def test_bag_output_soft_maximum_set_state_after_build(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- # pyformat: disable
- expected_output = [[0, 1, 1, 1, 0],
- [1, 1, 0, 1, 0]]
- # pyformat: enable
- max_tokens = 5
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=10,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT,
- pad_to_max_tokens=False)
- layer.build(input_data.shape)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_bag_output_soft_maximum_set_vocabulary_after_call_fails(self):
- vocab_data = ["earth", "wind", "and", "fire"]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT,
- pad_to_max_tokens=False)
- layer.adapt(vocab_data)
- _ = layer(input_data)
- with self.assertRaisesRegex(RuntimeError, "vocabulary cannot be changed"):
- layer.set_vocabulary(vocab_data)
-
- def test_bag_output_soft_maximum_set_state_variables_after_call_fails(self):
- state_variables = {
- text_vectorization._VOCAB_NAME: ["earth", "wind", "and", "fire"]
- }
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT,
- pad_to_max_tokens=False)
- layer.adapt(["earth", "wind"])
- _ = layer(input_data)
- with self.assertRaisesRegex(RuntimeError, "vocabulary cannot be changed"):
- layer._set_state_variables(state_variables)
-
- def test_count_output_hard_maximum(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- # pyformat: disable
- expected_output = [[0, 2, 1, 1, 0, 0],
- [2, 1, 0, 1, 0, 0]]
- # pyformat: enable
- max_tokens = 6
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=6,
- standardize=None,
- split=None,
- output_mode=text_vectorization.COUNT,
- pad_to_max_tokens=True)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_count_output_soft_maximum(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- # pyformat: disable
- expected_output = [[0, 2, 1, 1, 0],
- [2, 1, 0, 1, 0]]
- # pyformat: enable
- max_tokens = 5
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=10,
- standardize=None,
- split=None,
- output_mode=text_vectorization.COUNT,
- pad_to_max_tokens=False)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- def test_tfidf_output_hard_maximum(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- # OOV idf weight (bucket 0) should be 0.5, the average of the passed weights.
- idf_weights = [.4, .25, .75, .6]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "fire", "earth", "michigan"]])
-
- # pyformat: disable
- # pylint: disable=bad-whitespace
- expected_output = [[ 0, .8, .25, .75, 0, 0],
- [ 1, .4, 0, 0, .6, 0]]
- # pylint: enable=bad-whitespace
- # pyformat: enable
- max_tokens = 6
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=6,
- standardize=None,
- split=None,
- output_mode=text_vectorization.TF_IDF,
- pad_to_max_tokens=True)
- layer.set_vocabulary(vocab_data, idf_weights=idf_weights)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllClose(expected_output, output_dataset)
-
- def test_tfidf_output_soft_maximum(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- # OOV idf weight (bucket 0) should be 0.5, the average of the passed weights.
- idf_weights = [.4, .25, .75, .6]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "fire", "earth", "michigan"]])
-
- # pyformat: disable
- # pylint: disable=bad-whitespace
- expected_output = [[ 0, .8, .25, .75, 0],
- [ 1, .4, 0, 0, .6]]
- # pylint: enable=bad-whitespace
- # pyformat: enable
- max_tokens = 5
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=10,
- standardize=None,
- split=None,
- output_mode=text_vectorization.TF_IDF,
- pad_to_max_tokens=False)
- layer.set_vocabulary(vocab_data, idf_weights=idf_weights)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllClose(expected_output, output_dataset)
-
- def test_tfidf_output_set_oov_weight(self):
- vocab_data = ["[UNK]", "earth", "wind", "and", "fire"]
- idf_weights = [.1, .4, .25, .75, .6]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "fire", "earth", "michigan"]])
-
- # pyformat: disable
- # pylint: disable=bad-whitespace
- expected_output = [[ 0, .8, .25, .75, 0],
- [ .2, .4, 0, 0, .6]]
- # pylint: enable=bad-whitespace
- # pyformat: enable
- max_tokens = 5
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=10,
- standardize=None,
- split=None,
- output_mode=text_vectorization.TF_IDF,
- pad_to_max_tokens=False)
- layer.set_vocabulary(vocab_data, idf_weights=idf_weights)
- int_data = layer(input_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
-
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllClose(expected_output, output_dataset)
-
- def test_accept_1D_input(self):
- input_array = np.array(["earth wind and fire",
- "fire and earth michigan"])
- layer = text_vectorization.TextVectorization(
- standardize=None, split=None, output_mode="int")
- layer.adapt(input_array)
- _ = layer(input_array)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class TextVectorizationModelBuildingTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- @parameterized.named_parameters(
- {
- "testcase_name": "count_hard_max",
- "pad_to_max_tokens": True,
- "output_mode": text_vectorization.COUNT
- }, {
- "testcase_name": "count_soft_max",
- "pad_to_max_tokens": False,
- "output_mode": text_vectorization.COUNT
- }, {
- "testcase_name": "binary_hard_max",
- "pad_to_max_tokens": True,
- "output_mode": text_vectorization.MULTI_HOT
- }, {
- "testcase_name": "binary_soft_max",
- "pad_to_max_tokens": False,
- "output_mode": text_vectorization.MULTI_HOT
- }, {
- "testcase_name": "tfidf_hard_max",
- "pad_to_max_tokens": True,
- "output_mode": text_vectorization.TF_IDF
- }, {
- "testcase_name": "tfidf_soft_max",
- "pad_to_max_tokens": False,
- "output_mode": text_vectorization.TF_IDF
- })
- def test_end_to_end_bagged_modeling(self, output_mode, pad_to_max_tokens):
- vocab_data = ["earth", "wind", "and", "fire"]
- idf_weights = [.5, .25, .2, .125]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=10,
- standardize=None,
- split=None,
- output_mode=output_mode,
- pad_to_max_tokens=pad_to_max_tokens)
- if output_mode == text_vectorization.TF_IDF:
- layer.set_vocabulary(vocab_data, idf_weights=idf_weights)
- else:
- layer.set_vocabulary(vocab_data)
-
- int_data = layer(input_data)
- float_data = backend.cast(int_data, dtype="float32")
- output_data = core.Dense(64)(float_data)
- model = keras.Model(inputs=input_data, outputs=output_data)
- _ = model.predict(input_array)
-
- def test_end_to_end_vocab_modeling(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth wind and also fire"],
- ["fire and earth michigan"]])
- output_sequence_length = 6
- max_tokens = 5
-
- # The input shape here is explicitly 1 because we're tokenizing.
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=text_vectorization.SPLIT_ON_WHITESPACE,
- output_mode=text_vectorization.INT,
- output_sequence_length=output_sequence_length)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- embedded_data = embeddings.Embedding(
- input_dim=max_tokens + 1, output_dim=32)(
- int_data)
- output_data = convolutional.Conv1D(
- 250, 3, padding="valid", activation="relu", strides=1)(
- embedded_data)
-
- model = keras.Model(inputs=input_data, outputs=output_data)
- _ = model.predict(input_array)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class TextVectorizationVocabularyTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest,
-):
-
- def test_get_vocabulary(self):
- vocab = ["earth", "wind", "and", "fire"]
-
- layer = text_vectorization.TextVectorization(vocabulary=vocab)
- self.assertAllEqual(layer.get_vocabulary(),
- ["", "[UNK]", "earth", "wind", "and", "fire"])
-
- def test_get_vocabulary_adapt(self):
- vocab = np.array([["earth earth earth earth wind wind wind and and fire"]])
-
- layer = text_vectorization.TextVectorization()
- layer.adapt(vocab)
- self.assertAllEqual(layer.get_vocabulary(),
- ["", "[UNK]", "earth", "wind", "and", "fire"])
-
- def test_get_vocabulary_no_special_tokens(self):
- vocab = ["earth", "wind", "and", "fire"]
-
- layer = text_vectorization.TextVectorization(vocabulary=vocab)
- self.assertAllEqual(
- layer.get_vocabulary(include_special_tokens=False),
- ["earth", "wind", "and", "fire"])
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class TextVectorizationErrorTest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest
- ):
-
- def test_too_long_vocab_fails_in_single_setting(self):
- vocab_data = ["earth", "wind", "and", "fire"]
-
- layer = text_vectorization.TextVectorization(
- max_tokens=4,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT)
- with self.assertRaisesRegex(ValueError,
- "vocabulary larger than the maximum vocab.*"):
- layer.set_vocabulary(vocab_data)
-
- def test_setting_vocab_without_idf_weights_fails_in_tfidf_mode(self):
- vocab_data = ["earth", "wind", "and", "fire"]
-
- layer = text_vectorization.TextVectorization(
- max_tokens=5,
- standardize=None,
- split=None,
- output_mode=text_vectorization.TF_IDF)
- with self.assertRaisesRegex(
- ValueError, "`idf_weights` must be set if output_mode is TF_IDF"):
- layer.set_vocabulary(vocab_data)
-
- def test_idf_weights_length_mismatch_fails(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- idf_weights = [1, 2, 3]
- layer = text_vectorization.TextVectorization(
- max_tokens=5,
- standardize=None,
- split=None,
- output_mode=text_vectorization.TF_IDF)
- with self.assertRaisesRegex(
- ValueError, "`idf_weights` must be the same length as vocab"):
- layer.set_vocabulary(vocab_data, idf_weights)
-
- def test_set_tfidf_in_non_tfidf_fails(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- idf_weights = [1, 2, 3, 4]
- layer = text_vectorization.TextVectorization(
- max_tokens=5,
- standardize=None,
- split=None,
- output_mode=text_vectorization.MULTI_HOT)
- with self.assertRaisesRegex(ValueError,
- "`idf_weights` should only be set if"):
- layer.set_vocabulary(vocab_data, idf_weights)
-
- def test_zero_max_tokens_fails(self):
- with self.assertRaisesRegex(ValueError, "max_tokens.*"):
- _ = text_vectorization.TextVectorization(max_tokens=0)
-
- def test_non_string_dtype_fails(self):
- with self.assertRaisesRegex(ValueError, "dtype of string.*"):
- _ = text_vectorization.TextVectorization(dtype=dtypes.int64)
-
- def test_unknown_standardize_arg_fails(self):
- with self.assertRaisesRegex(ValueError,
- "standardize arg.*unsupported_value"):
- _ = text_vectorization.TextVectorization(standardize="unsupported_value")
-
- def test_unknown_split_arg_fails(self):
- with self.assertRaisesRegex(ValueError, "split arg.*unsupported_value"):
- _ = text_vectorization.TextVectorization(split="unsupported_value")
-
- def test_unknown_output_mode_arg_fails(self):
- with self.assertRaisesRegex(ValueError,
- "output_mode arg.*unsupported_value"):
- _ = text_vectorization.TextVectorization(output_mode="unsupported_value")
-
- def test_unknown_ngrams_arg_fails(self):
- with self.assertRaisesRegex(ValueError, "ngrams.*unsupported_value"):
- _ = text_vectorization.TextVectorization(ngrams="unsupported_value")
-
- def test_float_ngrams_arg_fails(self):
- with self.assertRaisesRegex(ValueError, "ngrams.*2.9"):
- _ = text_vectorization.TextVectorization(ngrams=2.9)
-
- def test_float_tuple_ngrams_arg_fails(self):
- with self.assertRaisesRegex(ValueError, "ngrams.*(1.3, 2.9)"):
- _ = text_vectorization.TextVectorization(ngrams=(1.3, 2.9))
-
- def test_non_int_output_sequence_length_dtype_fails(self):
- with self.assertRaisesRegex(ValueError, "output_sequence_length.*2.0"):
- _ = text_vectorization.TextVectorization(
- output_mode="int", output_sequence_length=2.0)
-
- def test_non_none_output_sequence_length_fails_if_output_type_not_int(self):
- with self.assertRaisesRegex(ValueError,
- "`output_sequence_length` must not be set"):
- _ = text_vectorization.TextVectorization(
- output_mode="count", output_sequence_length=2)
-
-
-# Custom functions for the custom callable serialization test. Declared here
-# to avoid multiple registrations from run_all_keras_modes().
-@generic_utils.register_keras_serializable(package="Test")
-def custom_standardize_fn(x):
- return gen_string_ops.string_lower(x)
-
-
-@generic_utils.register_keras_serializable(package="Test")
-def custom_split_fn(x):
- return ragged_string_ops.string_split_v2(x, sep=">")
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class TextVectorizationSavingTest(
- keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def tearDown(self):
- keras.backend.clear_session()
- gc.collect()
- super(TextVectorizationSavingTest, self).tearDown()
-
- def test_saving(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
-
- model.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
-
- loaded_model = keras.models.load_model(output_path)
- self.assertAllEqual(loaded_model.predict(input_array), expected_output)
-
- def test_saving_when_nested(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- input_array = np.array([["earth", "wind", "and", "fire"],
- ["fire", "and", "earth", "michigan"]])
- expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=None,
- split=None,
- output_mode=text_vectorization.INT)
- layer.set_vocabulary(vocab_data)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
-
- outer_input = keras.Input(shape=(None,), dtype=dtypes.string)
- outer_output = model(outer_input)
- outer_model = keras.Model(inputs=outer_input, outputs=outer_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- outer_model.save(output_path, save_format="tf")
-
- # Delete the session and graph to ensure that the loaded model is generated
- # from scratch.
- # TODO(b/149526183): Can't clear session when TF2 is disabled.
- if tf2.enabled():
- keras.backend.clear_session()
-
- loaded_model = keras.models.load_model(output_path)
- self.assertAllEqual(loaded_model.predict(input_array), expected_output)
-
- def test_saving_with_tfidf(self):
- vocab_data = ["earth", "wind", "and", "fire"]
- # OOV idf weight (bucket 0) should be 0.5, the average of passed weights.
- idf_weights = [.4, .25, .75, .6]
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "fire", "earth", "michigan"]])
-
- # pyformat: disable
- # pylint: disable=bad-whitespace
- expected_output = [[ 0, .8, .25, .75, 0],
- [ 1, .4, 0, 0, .6]]
- vocab_data = ["earth", "wind", "and", "fire"]
- # pylint: enable=bad-whitespace
- # pyformat: enable
-
- # Build and validate a golden model.
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=5,
- standardize=None,
- split=None,
- output_mode=text_vectorization.TF_IDF)
- layer.set_vocabulary(vocab_data, idf_weights=idf_weights)
-
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllClose(output_dataset, expected_output)
-
- # Save the model to disk.
- output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model")
- model.save(output_path, save_format="tf")
- loaded_model = keras.models.load_model(output_path)
-
- # Ensure that the loaded model is unique (so that the save/load is real)
- self.assertIsNot(model, loaded_model)
-
- # Validate correctness of the new model.
- new_output_dataset = loaded_model.predict(input_array)
- self.assertAllClose(new_output_dataset, expected_output)
-
- def test_serialization_with_custom_callables(self):
- input_array = np.array([["earth>wind>and Fire"],
- ["\tfire>And\nearth>michigan"]])
- expected_output = [[b"earth", b"wind", b"and fire"],
- [b"\tfire", b"and\nearth", b"michigan"]]
-
- input_data = keras.Input(shape=(1,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=None,
- standardize=custom_standardize_fn,
- split=custom_split_fn,
- ngrams=None,
- output_mode=None)
- int_data = layer(input_data)
- model = keras.Model(inputs=input_data, outputs=int_data)
- output_dataset = model.predict(input_array)
- self.assertAllEqual(expected_output, output_dataset)
-
- serialized_model_data = model.get_config()
- new_model = keras.Model.from_config(serialized_model_data)
- new_output_dataset = new_model.predict(input_array)
- self.assertAllEqual(expected_output, new_output_dataset)
-
-
-@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
-class TextVectorizationE2ETest(keras_parameterized.TestCase,
- preprocessing_test_utils.PreprocessingLayerTest):
-
- def test_keras_vocab_trimming_example(self):
- vocab_data = np.array([
- "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
- "and", "fire"
- ])
- input_array = np.array([["earth", "wind", "and", "earth"],
- ["ohio", "and", "earth", "michigan"]])
-
- # pyformat: disable
- expected_output = [[1, 2, 1],
- [3, 1, 0]]
- # pyformat: enable
- max_tokens = 3
- expected_output_shape = [None, max_tokens]
-
- input_data = keras.Input(shape=(None,), dtype=dtypes.string)
- layer = text_vectorization.TextVectorization(
- max_tokens=max_tokens,
- standardize=None,
- split=None,
- output_mode=text_vectorization.COUNT,
- pad_to_max_tokens=True)
- int_data = layer(input_data)
- layer.adapt(vocab_data)
- self.assertAllEqual(expected_output_shape, int_data.shape.as_list())
- model = keras.Model(input_data, int_data)
- output = model.predict(input_array)
- self.assertAllEqual(expected_output, output)
-
-
-if __name__ == "__main__":
- test.main()
diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py
index f623084..fe7f46a 100644
--- a/tensorflow/python/keras/layers/serialization.py
+++ b/tensorflow/python/keras/layers/serialization.py
@@ -43,15 +43,6 @@
from tensorflow.python.keras.layers.normalization import batch_normalization
from tensorflow.python.keras.layers.normalization import batch_normalization_v1
from tensorflow.python.keras.layers.normalization import layer_normalization
-from tensorflow.python.keras.layers.preprocessing import category_crossing
-from tensorflow.python.keras.layers.preprocessing import category_encoding
-from tensorflow.python.keras.layers.preprocessing import discretization
-from tensorflow.python.keras.layers.preprocessing import hashing
-from tensorflow.python.keras.layers.preprocessing import image_preprocessing
-from tensorflow.python.keras.layers.preprocessing import integer_lookup
-from tensorflow.python.keras.layers.preprocessing import normalization as preprocessing_normalization
-from tensorflow.python.keras.layers.preprocessing import string_lookup
-from tensorflow.python.keras.layers.preprocessing import text_vectorization
from tensorflow.python.keras.utils import generic_utils
from tensorflow.python.keras.utils import tf_inspect as inspect
from tensorflow.python.util.tf_export import keras_export
@@ -60,10 +51,7 @@
convolutional_recurrent, core, cudnn_recurrent, dense_attention,
embeddings, einsum_dense, local, merge, noise,
batch_normalization_v1, layer_normalization,
- pooling, image_preprocessing, recurrent, wrappers, hashing,
- category_crossing, category_encoding, discretization,
- multi_head_attention, integer_lookup,
- preprocessing_normalization, string_lookup, text_vectorization)
+ pooling, recurrent, wrappers, multi_head_attention)
ALL_V2_MODULES = (rnn_cell_wrapper_v2, batch_normalization, layer_normalization,
recurrent_v2)
# ALL_OBJECTS is meant to be a global mutable. Hence we need to make it
diff --git a/tensorflow/python/keras/mixed_precision/layer_correctness_test.py b/tensorflow/python/keras/mixed_precision/layer_correctness_test.py
index 82349b5..e6a9e3f 100644
--- a/tensorflow/python/keras/mixed_precision/layer_correctness_test.py
+++ b/tensorflow/python/keras/mixed_precision/layer_correctness_test.py
@@ -39,8 +39,6 @@
from tensorflow.python.keras.layers import wrappers
from tensorflow.python.keras.layers.normalization import batch_normalization
from tensorflow.python.keras.layers.normalization import layer_normalization
-from tensorflow.python.keras.layers.preprocessing import image_preprocessing
-from tensorflow.python.keras.layers.preprocessing import normalization
from tensorflow.python.keras.mixed_precision import policy
from tensorflow.python.platform import test
@@ -51,19 +49,6 @@
return mirrored_strategy.MirroredStrategy(['cpu:0', 'cpu:1'])
-def _create_normalization_layer_with_adapt():
- layer = normalization.Normalization()
- layer.adapt(np.random.normal(size=(10, 4)))
- return layer
-
-
-def _create_normalization_layer_without_adapt():
- return normalization.Normalization(
- mean=np.random.normal(size=(4,)),
- variance=np.random.uniform(0.5, 2., size=(4,))
- )
-
-
class LayerCorrectnessTest(keras_parameterized.TestCase):
def setUp(self):
@@ -159,13 +144,6 @@
lambda: dense_attention.AdditiveAttention(causal=True), [(2, 3, 4),
(2, 3, 4),
(2, 3, 4)]),
- ('NormalizationAdapt', _create_normalization_layer_with_adapt, (4, 4)),
- ('NormalizationNoAdapt', _create_normalization_layer_without_adapt,
- (4, 4)),
- ('Resizing', lambda: image_preprocessing.Resizing(3, 3), (2, 5, 5, 1)),
- ('Rescaling', lambda: image_preprocessing.Rescaling(2., 1.), (6, 6)),
- ('CenterCrop', lambda: image_preprocessing.CenterCrop(3, 3),
- (2, 5, 5, 1))
)
def test_layer(self, f32_layer_fn, input_shape, rtol=2e-3, atol=2e-3,
input_data=None):
diff --git a/tensorflow/python/keras/preprocessing/BUILD b/tensorflow/python/keras/preprocessing/BUILD
deleted file mode 100644
index 876a2c3..0000000
--- a/tensorflow/python/keras/preprocessing/BUILD
+++ /dev/null
@@ -1,162 +0,0 @@
-# Description:
-# Contains the Keras preprocessing utilities (internal TensorFlow version).
-
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-
-package(
- default_visibility = [
- "//tensorflow/python/keras:__subpackages__",
- ],
- licenses = ["notice"],
-)
-
-filegroup(
- name = "all_py_srcs",
- srcs = glob(["*.py"]),
- visibility = ["//tensorflow/python/keras/google/private_tf_api_test:__pkg__"],
-)
-
-py_library(
- name = "preprocessing",
- srcs = [
- "__init__.py",
- ],
- srcs_version = "PY3",
- deps = [
- ":image",
- ":sequence",
- ":text",
- ":timeseries",
- "//tensorflow/python/keras/utils:all_utils",
- ],
-)
-
-py_library(
- name = "image",
- srcs = [
- "dataset_utils.py",
- "image.py",
- "image_dataset.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:util",
- "//tensorflow/python/keras:backend",
- "//tensorflow/python/keras/utils:data_utils",
- ],
-)
-
-py_library(
- name = "sequence",
- srcs = [
- "sequence.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:util",
- "//tensorflow/python/keras/utils:data_utils",
- ],
-)
-
-py_library(
- name = "timeseries",
- srcs = [
- "timeseries.py",
- ],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python:array_ops",
- "//tensorflow/python:math_ops",
- "//tensorflow/python/data/ops:dataset_ops",
- "//third_party/py/numpy",
- ],
-)
-
-py_library(
- name = "text",
- srcs = [
- "dataset_utils.py",
- "text.py",
- "text_dataset.py",
- ],
- srcs_version = "PY3",
- deps = ["//tensorflow/python:util"],
-)
-
-tf_py_test(
- name = "image_test",
- size = "medium",
- srcs = ["image_test.py"],
- python_version = "PY3",
- deps = [
- ":image",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/compat:v2_compat",
- "//tensorflow/python/keras",
- "//third_party/py/numpy",
- ],
-)
-
-tf_py_test(
- name = "image_dataset_test",
- size = "small",
- srcs = ["image_dataset_test.py"],
- python_version = "PY3",
- deps = [
- ":image",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/compat:v2_compat",
- "//tensorflow/python/keras",
- "//third_party/py/numpy",
- ],
-)
-
-tf_py_test(
- name = "sequence_test",
- size = "small",
- srcs = ["sequence_test.py"],
- python_version = "PY3",
- deps = [
- ":sequence",
- "//tensorflow/python:client_testlib",
- "//third_party/py/numpy",
- ],
-)
-
-tf_py_test(
- name = "text_test",
- size = "small",
- srcs = ["text_test.py"],
- python_version = "PY3",
- deps = [
- ":text",
- "//tensorflow/python:client_testlib",
- "//third_party/py/numpy",
- ],
-)
-
-tf_py_test(
- name = "text_dataset_test",
- size = "small",
- srcs = ["text_dataset_test.py"],
- python_version = "PY3",
- deps = [
- ":text",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/compat:v2_compat",
- "//tensorflow/python/keras",
- ],
-)
-
-tf_py_test(
- name = "timeseries_test",
- size = "small",
- srcs = ["timeseries_test.py"],
- python_version = "PY3",
- deps = [
- ":timeseries",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python/compat:v2_compat",
- "//third_party/py/numpy",
- ],
-)
diff --git a/tensorflow/python/keras/preprocessing/__init__.py b/tensorflow/python/keras/preprocessing/__init__.py
deleted file mode 100644
index 42151dc..0000000
--- a/tensorflow/python/keras/preprocessing/__init__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Provides keras data preprocessing utils to pre-process tf.data.Datasets before they are fed to the model."""
-# pylint: disable=g-import-not-at-top
-# TODO(mihaimaruseac): remove the import of keras_preprocessing and injecting
-# once we update to latest version of keras_preprocessing
-import keras_preprocessing
-
-from tensorflow.python.keras import backend
-from tensorflow.python.keras.preprocessing import image
-from tensorflow.python.keras.preprocessing import sequence
-from tensorflow.python.keras.preprocessing import text
-from tensorflow.python.keras.preprocessing import timeseries
-from tensorflow.python.keras.utils import all_utils as utils
-
-# This exists for compatibility with prior version of keras_preprocessing.
-keras_preprocessing.set_keras_submodules(backend=backend, utils=utils)
diff --git a/tensorflow/python/keras/preprocessing/dataset_utils.py b/tensorflow/python/keras/preprocessing/dataset_utils.py
deleted file mode 100644
index a6fbaee..0000000
--- a/tensorflow/python/keras/preprocessing/dataset_utils.py
+++ /dev/null
@@ -1,244 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras image dataset loading utilities."""
-# pylint: disable=g-classes-have-attributes
-
-import multiprocessing
-import os
-
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-
-
-def index_directory(directory,
- labels,
- formats,
- class_names=None,
- shuffle=True,
- seed=None,
- follow_links=False):
- """Make list of all files in the subdirs of `directory`, with their labels.
-
- Args:
- directory: The target directory (string).
- labels: Either "inferred"
- (labels are generated from the directory structure),
- None (no labels),
- or a list/tuple of integer labels of the same size as the number of
- valid files found in the directory. Labels should be sorted according
- to the alphanumeric order of the image file paths
- (obtained via `os.walk(directory)` in Python).
- formats: Allowlist of file extensions to index (e.g. ".jpg", ".txt").
- class_names: Only valid if "labels" is "inferred". This is the explicit
- list of class names (must match names of subdirectories). Used
- to control the order of the classes
- (otherwise alphanumerical order is used).
- shuffle: Whether to shuffle the data. Default: True.
- If set to False, sorts the data in alphanumeric order.
- seed: Optional random seed for shuffling.
- follow_links: Whether to visit subdirectories pointed to by symlinks.
-
- Returns:
- tuple (file_paths, labels, class_names).
- file_paths: list of file paths (strings).
- labels: list of matching integer labels (same length as file_paths)
- class_names: names of the classes corresponding to these labels, in order.
- """
- if labels is None:
- # in the no-label case, index from the parent directory down.
- subdirs = ['']
- class_names = subdirs
- else:
- subdirs = []
- for subdir in sorted(os.listdir(directory)):
- if os.path.isdir(os.path.join(directory, subdir)):
- subdirs.append(subdir)
- if not class_names:
- class_names = subdirs
- else:
- if set(class_names) != set(subdirs):
- raise ValueError(
- 'The `class_names` passed did not match the '
- 'names of the subdirectories of the target directory. '
- 'Expected: %s, but received: %s' %
- (subdirs, class_names))
- class_indices = dict(zip(class_names, range(len(class_names))))
-
- # Build an index of the files
- # in the different class subfolders.
- pool = multiprocessing.pool.ThreadPool()
- results = []
- filenames = []
-
- for dirpath in (os.path.join(directory, subdir) for subdir in subdirs):
- results.append(
- pool.apply_async(index_subdirectory,
- (dirpath, class_indices, follow_links, formats)))
- labels_list = []
- for res in results:
- partial_filenames, partial_labels = res.get()
- labels_list.append(partial_labels)
- filenames += partial_filenames
- if labels not in ('inferred', None):
- if len(labels) != len(filenames):
- raise ValueError('Expected the lengths of `labels` to match the number '
- 'of files in the target directory. len(labels) is %s '
- 'while we found %s files in %s.' % (
- len(labels), len(filenames), directory))
- else:
- i = 0
- labels = np.zeros((len(filenames),), dtype='int32')
- for partial_labels in labels_list:
- labels[i:i + len(partial_labels)] = partial_labels
- i += len(partial_labels)
-
- if labels is None:
- print('Found %d files.' % (len(filenames),))
- else:
- print('Found %d files belonging to %d classes.' %
- (len(filenames), len(class_names)))
- pool.close()
- pool.join()
- file_paths = [os.path.join(directory, fname) for fname in filenames]
-
- if shuffle:
- # Shuffle globally to erase macro-structure
- if seed is None:
- seed = np.random.randint(1e6)
- rng = np.random.RandomState(seed)
- rng.shuffle(file_paths)
- rng = np.random.RandomState(seed)
- rng.shuffle(labels)
- return file_paths, labels, class_names
-
-
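-# Illustrative sketch, not part of the original module: a minimal example of
-# how `index_directory` above is typically called. The 'flowers' directory
-# layout, its 'daisy'/'rose' subdirectories, and the extension allowlist are
-# hypothetical values chosen for the example.
-def _example_index_directory():
-  file_paths, labels, class_names = index_directory(
-      directory='flowers',        # contains the subdirs 'daisy/' and 'rose/'
-      labels='inferred',          # derive labels from the subdirectory names
-      formats=('.jpg', '.jpeg'),  # only index files with these extensions
-      shuffle=True,
-      seed=123)
-  # class_names == ['daisy', 'rose']; labels[i] is an index into class_names
-  # and file_paths[i] is the matching path under 'flowers/'.
-  return file_paths, labels, class_names
-
-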
-def iter_valid_files(directory, follow_links, formats):
- walk = os.walk(directory, followlinks=follow_links)
- for root, _, files in sorted(walk, key=lambda x: x[0]):
- for fname in sorted(files):
- if fname.lower().endswith(formats):
- yield root, fname
-
-
-def index_subdirectory(directory, class_indices, follow_links, formats):
- """Recursively walks directory and list image paths and their class index.
-
- Args:
- directory: string, target directory.
- class_indices: dict mapping class names to their index.
- follow_links: boolean, whether to recursively follow subdirectories
- (if False, we only list top-level images in `directory`).
- formats: Allowlist of file extensions to index (e.g. ".jpg", ".txt").
-
- Returns:
- tuple `(filenames, labels)`. `filenames` is a list of relative file
- paths, and `labels` is a list of integer labels corresponding to these
- files.
- """
- dirname = os.path.basename(directory)
- valid_files = iter_valid_files(directory, follow_links, formats)
- labels = []
- filenames = []
- for root, fname in valid_files:
- labels.append(class_indices[dirname])
- absolute_path = os.path.join(root, fname)
- relative_path = os.path.join(
- dirname, os.path.relpath(absolute_path, directory))
- filenames.append(relative_path)
- return filenames, labels
-
-
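-# Illustrative sketch, not part of the original module: `index_subdirectory`
-# returns paths relative to the parent of the class directory. The
-# 'flowers/daisy' layout and the class mapping below are hypothetical.
-def _example_index_subdirectory():
-  class_indices = {'daisy': 0, 'rose': 1}
-  filenames, labels = index_subdirectory(
-      'flowers/daisy', class_indices, follow_links=False, formats=('.jpg',))
-  # filenames look like 'daisy/img_001.jpg' and every label equals
-  # class_indices['daisy'] == 0.
-  return filenames, labels
-
-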
-def get_training_or_validation_split(samples, labels, validation_split, subset):
- """Potentially restict samples & labels to a training or validation split.
-
- Args:
- samples: List of elements.
- labels: List of corresponding labels.
- validation_split: Float, fraction of data to reserve for validation.
- subset: Subset of the data to return.
- Either "training", "validation", or None. If None, we return all of the
- data.
-
- Returns:
- tuple (samples, labels), potentially restricted to the specified subset.
- """
- if not validation_split:
- return samples, labels
-
- num_val_samples = int(validation_split * len(samples))
- if subset == 'training':
- print('Using %d files for training.' % (len(samples) - num_val_samples,))
- samples = samples[:-num_val_samples]
- labels = labels[:-num_val_samples]
- elif subset == 'validation':
- print('Using %d files for validation.' % (num_val_samples,))
- samples = samples[-num_val_samples:]
- labels = labels[-num_val_samples:]
- else:
- raise ValueError('`subset` must be either "training" '
- 'or "validation", received: %s' % (subset,))
- return samples, labels
-
-
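-# Illustrative sketch, not part of the original module: the split arithmetic
-# implemented by `get_training_or_validation_split` above, using hypothetical
-# sample values.
-def _example_training_validation_split():
-  samples = ['a.jpg', 'b.jpg', 'c.jpg', 'd.jpg', 'e.jpg']
-  labels = [0, 0, 1, 1, 1]
-  # With validation_split=0.2, num_val_samples = int(0.2 * 5) = 1, so the last
-  # file is reserved for validation and the first four are kept for training.
-  train_x, train_y = get_training_or_validation_split(
-      samples, labels, validation_split=0.2, subset='training')
-  val_x, val_y = get_training_or_validation_split(
-      samples, labels, validation_split=0.2, subset='validation')
-  assert train_x == ['a.jpg', 'b.jpg', 'c.jpg', 'd.jpg']
-  assert val_x == ['e.jpg']
-  return (train_x, train_y), (val_x, val_y)
-
-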
-def labels_to_dataset(labels, label_mode, num_classes):
- """Create a tf.data.Dataset from the list/tuple of labels.
-
- Args:
- labels: list/tuple of labels to be converted into a tf.data.Dataset.
- label_mode:
- - 'binary' indicates that the labels (there can be only 2) are encoded as
- `float32` scalars with values 0 or 1 (e.g. for `binary_crossentropy`).
- - 'categorical' means that the labels are mapped into a categorical vector
- (e.g. for `categorical_crossentropy` loss).
- num_classes: number of classes of labels.
- """
- label_ds = dataset_ops.Dataset.from_tensor_slices(labels)
- if label_mode == 'binary':
- label_ds = label_ds.map(
- lambda x: array_ops.expand_dims(math_ops.cast(x, 'float32'), axis=-1))
- elif label_mode == 'categorical':
- label_ds = label_ds.map(lambda x: array_ops.one_hot(x, num_classes))
- return label_ds
-
-
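-# Illustrative sketch, not part of the original module: how the two label
-# modes handled by `labels_to_dataset` above encode hypothetical label lists.
-def _example_labels_to_dataset():
-  # 'categorical' one-hot encodes each label into a vector of length
-  # num_classes, e.g. 2 -> [0., 0., 1.].
-  categorical_ds = labels_to_dataset([0, 1, 2, 1], label_mode='categorical',
-                                     num_classes=3)
-  # 'binary' casts each label to a float32 scalar with a trailing length-1
-  # axis, e.g. 1 -> [1.0]; it assumes only two classes are present.
-  binary_ds = labels_to_dataset([0, 1, 1, 0], label_mode='binary',
-                                num_classes=2)
-  return categorical_ds, binary_ds
-
-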
-def check_validation_split_arg(validation_split, subset, shuffle, seed):
- """Raise errors in case of invalid argument values.
-
- Args:
- validation_split: float between 0 and 1, fraction of data to reserve for
- validation.
- subset: One of "training" or "validation". Only used if `validation_split`
- is set.
- shuffle: Whether to shuffle the data. Either True or False.
- seed: random seed for shuffling and transformations.
- """
- if validation_split and not 0 < validation_split < 1:
- raise ValueError(
- '`validation_split` must be between 0 and 1, received: %s' %
- (validation_split,))
- if (validation_split or subset) and not (validation_split and subset):
- raise ValueError(
- 'If `subset` is set, `validation_split` must be set, and inversely.')
- if subset not in ('training', 'validation', None):
- raise ValueError('`subset` must be either "training" '
- 'or "validation", received: %s' % (subset,))
- if validation_split and shuffle and seed is None:
- raise ValueError(
- 'If using `validation_split` and shuffling the data, you must provide '
- 'a `seed` argument, to make sure that there is no overlap between the '
- 'training and validation subset.')
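-
-
-# Illustrative sketch, not part of the original module: argument combinations
-# accepted and rejected by `check_validation_split_arg` above.
-def _example_check_validation_split_arg():
-  # Valid: a split in (0, 1), a matching subset, and a fixed seed when
-  # shuffling (the seed keeps the training and validation partitions disjoint).
-  check_validation_split_arg(
-      validation_split=0.2, subset='training', shuffle=True, seed=1337)
-  # Invalid: passing a subset without a validation_split raises a ValueError.
-  try:
-    check_validation_split_arg(
-        validation_split=None, subset='validation', shuffle=False, seed=None)
-  except ValueError:
-    pass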
diff --git a/tensorflow/python/keras/preprocessing/image.py b/tensorflow/python/keras/preprocessing/image.py
deleted file mode 100644
index 6c875e1..0000000
--- a/tensorflow/python/keras/preprocessing/image.py
+++ /dev/null
@@ -1,1152 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# pylint: disable=invalid-name
-# pylint: disable=g-import-not-at-top
-# pylint: disable=g-classes-have-attributes
-"""Set of tools for real-time data augmentation on image data."""
-
-from keras_preprocessing import image
-import numpy as np
-try:
- from scipy import linalg # pylint: disable=unused-import
- from scipy import ndimage # pylint: disable=unused-import
-except ImportError:
- pass
-
-from tensorflow.python.framework import ops
-from tensorflow.python.keras import backend
-from tensorflow.python.keras.preprocessing.image_dataset import image_dataset_from_directory # pylint: disable=unused-import
-from tensorflow.python.keras.utils import data_utils
-from tensorflow.python.keras.utils import tf_inspect
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import image_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.util.tf_export import keras_export
-
-random_rotation = image.random_rotation
-random_shift = image.random_shift
-random_shear = image.random_shear
-random_zoom = image.random_zoom
-apply_channel_shift = image.apply_channel_shift
-random_channel_shift = image.random_channel_shift
-apply_brightness_shift = image.apply_brightness_shift
-random_brightness = image.random_brightness
-apply_affine_transform = image.apply_affine_transform
-
-
-@keras_export('keras.preprocessing.image.smart_resize', v1=[])
-def smart_resize(x, size, interpolation='bilinear'):
- """Resize images to a target size without aspect ratio distortion.
-
- TensorFlow image datasets typically yield images that each have a different
- size. However, these images need to be batched before they can be
- processed by Keras layers. To be batched, images need to share the same height
- and width.
-
- You could simply do:
-
- ```python
- size = (200, 200)
- ds = ds.map(lambda img: tf.image.resize(img, size))
- ```
-
- However, if you do this, you distort the aspect ratio of your images, since
- in general they do not all have the same aspect ratio as `size`. This is
- fine in many cases, but not always (e.g. for GANs this can be a problem).
-
- Note that passing the argument `preserve_aspect_ratio=True` to `resize`
- will preserve the aspect ratio, but at the cost of no longer respecting the
- provided target size. Because `tf.image.resize` doesn't crop images,
- your output images will still have different sizes.
-
- This calls for:
-
- ```python
- size = (200, 200)
- ds = ds.map(lambda img: smart_resize(img, size))
- ```
-
- Your output images will actually be `(200, 200)`, and will not be distorted.
- Instead, the parts of the image that do not fit within the target size
- get cropped out.
-
- The resizing process is:
-
- 1. Take the largest centered crop of the image that has the same aspect ratio
- as the target size. For instance, if `size=(200, 200)` and the input image has
- size `(340, 500)`, we take a crop of `(340, 340)` centered along the width.
- 2. Resize the cropped image to the target size. In the example above,
- we resize the `(340, 340)` crop to `(200, 200)`.
-
- Args:
- x: Input image or batch of images (as a tensor or NumPy array).
- Must be in format `(height, width, channels)` or
- `(batch_size, height, width, channels)`.
- size: Tuple of `(height, width)` integers. Target size.
- interpolation: String, interpolation to use for resizing.
- Defaults to `'bilinear'`. Supports `bilinear`, `nearest`, `bicubic`,
- `area`, `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic`.
-
- Returns:
- Array with shape `(size[0], size[1], channels)`. If the input image was a
- NumPy array, the output is a NumPy array, and if it was a TF tensor,
- the output is a TF tensor.
- """
- if len(size) != 2:
- raise ValueError('Expected `size` to be a tuple of 2 integers, '
- 'but got: %s' % (size,))
- img = ops.convert_to_tensor_v2_with_dispatch(x)
- if img.shape.rank is not None:
- if img.shape.rank < 3 or img.shape.rank > 4:
- raise ValueError(
- 'Expected an image array with shape `(height, width, channels)`, '
- 'or `(batch_size, height, width, channels)` but '
- 'got input with incorrect rank, of shape %s' % (img.shape,))
- shape = array_ops.shape(img)
- if img.shape.rank == 4:
- height, width = shape[1], shape[2]
- static_num_channels = img.shape[-1]
- else:
- height, width = shape[0], shape[1]
- target_height, target_width = size
-
- crop_height = math_ops.cast(
- math_ops.cast(width * target_height, 'float32') / target_width, 'int32')
- crop_width = math_ops.cast(
- math_ops.cast(height * target_width, 'float32') / target_height, 'int32')
-
- # Set back to input height / width if crop_height / crop_width is not smaller.
- crop_height = math_ops.minimum(height, crop_height)
- crop_width = math_ops.minimum(width, crop_width)
-
- crop_box_hstart = math_ops.cast(
- math_ops.cast(height - crop_height, 'float32') / 2, 'int32')
- crop_box_wstart = math_ops.cast(
- math_ops.cast(width - crop_width, 'float32') / 2, 'int32')
-
- if img.shape.rank == 4:
- crop_box_start = array_ops.stack([0, crop_box_hstart, crop_box_wstart, 0])
- crop_box_size = array_ops.stack([-1, crop_height, crop_width, -1])
- else:
- crop_box_start = array_ops.stack([crop_box_hstart, crop_box_wstart, 0])
- crop_box_size = array_ops.stack([crop_height, crop_width, -1])
-
- img = array_ops.slice(img, crop_box_start, crop_box_size)
- img = image_ops.resize_images_v2(
- images=img,
- size=size,
- method=interpolation)
- if img.shape.rank == 4:
- # Apparent bug in resize_images_v2 may cause shape to be lost
- img.set_shape((None, None, None, static_num_channels))
- if isinstance(x, np.ndarray):
- return img.numpy()
- return img
-
-
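-# Illustrative sketch, not part of the original module: the crop-then-resize
-# arithmetic described in the `smart_resize` docstring, applied to its
-# (340, 500) -> (200, 200) example. Assumes eager execution (the TF2 default)
-# so that the result comes back as a NumPy array.
-def _example_smart_resize_arithmetic():
-  height, width = 340, 500
-  target_height, target_width = 200, 200
-  # Largest centered crop with the target aspect ratio (1:1 here):
-  crop_height = min(height, int(width * target_height / target_width))  # 340
-  crop_width = min(width, int(height * target_width / target_height))   # 340
-  assert (crop_height, crop_width) == (340, 340)
-  # The (340, 340) crop is centered along the width, then resized to (200, 200).
-  img = np.random.random((height, width, 3)).astype('float32')
-  out = smart_resize(img, size=(target_height, target_width))
-  assert out.shape == (200, 200, 3)
-  return out
-
-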
-@keras_export('keras.utils.array_to_img',
- 'keras.preprocessing.image.array_to_img')
-def array_to_img(x, data_format=None, scale=True, dtype=None):
- """Converts a 3D Numpy array to a PIL Image instance.
-
- Usage:
-
- ```python
- from PIL import Image
- img = np.random.random(size=(100, 100, 3))
- pil_img = tf.keras.preprocessing.image.array_to_img(img)
- ```
-
-
- Args:
- x: Input data, in any form that can be converted to a Numpy array.
- data_format: Image data format, can be either "channels_first" or
- "channels_last". Defaults to `None`, in which case the global setting
- `tf.keras.backend.image_data_format()` is used (unless you changed it,
- it defaults to "channels_last").
- scale: Whether to rescale the image such that minimum and maximum values
- are 0 and 255 respectively. Defaults to `True`.
- dtype: Dtype to use. Defaults to `None`, in which case the global setting
- `tf.keras.backend.floatx()` is used (unless you changed it, it defaults
- to "float32").
-
- Returns:
- A PIL Image instance.
-
- Raises:
- ImportError: if PIL is not available.
- ValueError: if invalid `x` or `data_format` is passed.
- """
-
- if data_format is None:
- data_format = backend.image_data_format()
- kwargs = {}
- if 'dtype' in tf_inspect.getfullargspec(image.array_to_img)[0]:
- if dtype is None:
- dtype = backend.floatx()
- kwargs['dtype'] = dtype
- return image.array_to_img(x, data_format=data_format, scale=scale, **kwargs)
-
-
-@keras_export('keras.utils.img_to_array',
- 'keras.preprocessing.image.img_to_array')
-def img_to_array(img, data_format=None, dtype=None):
- """Converts a PIL Image instance to a Numpy array.
-
- Usage:
-
- ```python
- from PIL import Image
- img_data = np.random.random(size=(100, 100, 3))
- img = tf.keras.preprocessing.image.array_to_img(img_data)
- array = tf.keras.preprocessing.image.img_to_array(img)
- ```
-
-
- Args:
- img: Input PIL Image instance.
- data_format: Image data format, can be either "channels_first" or
- "channels_last". Defaults to `None`, in which case the global setting
- `tf.keras.backend.image_data_format()` is used (unless you changed it,
- it defaults to "channels_last").
- dtype: Dtype to use. Defaults to `None`, in which case the global setting
- `tf.keras.backend.floatx()` is used (unless you changed it, it defaults
- to "float32").
-
- Returns:
- A 3D Numpy array.
-
- Raises:
- ValueError: if invalid `img` or `data_format` is passed.
- """
-
- if data_format is None:
- data_format = backend.image_data_format()
- kwargs = {}
- if 'dtype' in tf_inspect.getfullargspec(image.img_to_array)[0]:
- if dtype is None:
- dtype = backend.floatx()
- kwargs['dtype'] = dtype
- return image.img_to_array(img, data_format=data_format, **kwargs)
-
-
-@keras_export('keras.utils.save_img',
- 'keras.preprocessing.image.save_img')
-def save_img(path,
- x,
- data_format=None,
- file_format=None,
- scale=True,
- **kwargs):
- """Saves an image stored as a Numpy array to a path or file object.
-
- Args:
- path: Path or file object.
- x: Numpy array.
- data_format: Image data format,
- either "channels_first" or "channels_last".
- file_format: Optional file format override. If omitted, the
- format to use is determined from the filename extension.
- If a file object was used instead of a filename, this
- parameter should always be used.
- scale: Whether to rescale image values to be within `[0, 255]`.
- **kwargs: Additional keyword arguments passed to `PIL.Image.save()`.
- """
- if data_format is None:
- data_format = backend.image_data_format()
- image.save_img(path,
- x,
- data_format=data_format,
- file_format=file_format,
- scale=scale, **kwargs)
-
-
-@keras_export('keras.utils.load_img',
- 'keras.preprocessing.image.load_img')
-def load_img(path, grayscale=False, color_mode='rgb', target_size=None,
- interpolation='nearest'):
- """Loads an image into PIL format.
-
- Usage:
-
- ```
- image = tf.keras.preprocessing.image.load_img(image_path)
- input_arr = tf.keras.preprocessing.image.img_to_array(image)
- input_arr = np.array([input_arr]) # Convert single image to a batch.
- predictions = model.predict(input_arr)
- ```
-
- Args:
- path: Path to image file.
- grayscale: DEPRECATED; use `color_mode="grayscale"`.
- color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb".
- The desired image format.
- target_size: Either `None` (default to original size)
- or tuple of ints `(img_height, img_width)`.
- interpolation: Interpolation method used to resample the image if the
- target size is different from that of the loaded image.
- Supported methods are "nearest", "bilinear", and "bicubic".
- If PIL version 1.1.3 or newer is installed, "lanczos" is also
- supported. If PIL version 3.4.0 or newer is installed, "box" and
- "hamming" are also supported. By default, "nearest" is used.
-
- Returns:
- A PIL Image instance.
-
- Raises:
- ImportError: if PIL is not available.
- ValueError: if interpolation method is not supported.
- """
- return image.load_img(path, grayscale=grayscale, color_mode=color_mode,
- target_size=target_size, interpolation=interpolation)
-
-
-@keras_export('keras.preprocessing.image.Iterator')
-class Iterator(image.Iterator, data_utils.Sequence):
- pass
-
-
-@keras_export('keras.preprocessing.image.DirectoryIterator')
-class DirectoryIterator(image.DirectoryIterator, Iterator): # pylint: disable=inconsistent-mro
- """Iterator capable of reading images from a directory on disk.
-
- Args:
- directory: Path to the directory to read images from.
- Each subdirectory in this directory will be
- considered to contain images from one class,
- or alternatively you could specify class subdirectories
- via the `classes` argument.
- image_data_generator: Instance of `ImageDataGenerator`
- to use for random transformations and normalization.
- target_size: tuple of integers, dimensions to resize input images to.
- color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`.
- Color mode to read images.
- classes: Optional list of strings, names of subdirectories
- containing images from each class (e.g. `["dogs", "cats"]`).
- It will be computed automatically if not set.
- class_mode: Mode for yielding the targets:
- - `"binary"`: binary targets (if there are only two classes),
- - `"categorical"`: categorical targets,
- - `"sparse"`: integer targets,
- - `"input"`: targets are images identical to input images (mainly
- used to work with autoencoders),
- - `None`: no targets get yielded (only input images are yielded).
- batch_size: Integer, size of a batch.
- shuffle: Boolean, whether to shuffle the data between epochs.
- seed: Random seed for data shuffling.
- data_format: String, one of `channels_first`, `channels_last`.
- save_to_dir: Optional directory where to save the pictures
- being yielded, in a viewable format. This is useful
- for visualizing the random transformations being
- applied, for debugging purposes.
- save_prefix: String prefix to use for saving sample
- images (if `save_to_dir` is set).
- save_format: Format to use for saving sample images
- (if `save_to_dir` is set).
- subset: Subset of data (`"training"` or `"validation"`) if
- validation_split is set in ImageDataGenerator.
- interpolation: Interpolation method used to resample the image if the
- target size is different from that of the loaded image.
- Supported methods are "nearest", "bilinear", and "bicubic".
- If PIL version 1.1.3 or newer is installed, "lanczos" is also
- supported. If PIL version 3.4.0 or newer is installed, "box" and
- "hamming" are also supported. By default, "nearest" is used.
- dtype: Dtype to use for generated arrays.
- """
-
- def __init__(self, directory, image_data_generator,
- target_size=(256, 256),
- color_mode='rgb',
- classes=None,
- class_mode='categorical',
- batch_size=32,
- shuffle=True,
- seed=None,
- data_format=None,
- save_to_dir=None,
- save_prefix='',
- save_format='png',
- follow_links=False,
- subset=None,
- interpolation='nearest',
- dtype=None):
- if data_format is None:
- data_format = backend.image_data_format()
- kwargs = {}
- if 'dtype' in tf_inspect.getfullargspec(
- image.ImageDataGenerator.__init__)[0]:
- if dtype is None:
- dtype = backend.floatx()
- kwargs['dtype'] = dtype
- super(DirectoryIterator, self).__init__(
- directory, image_data_generator,
- target_size=target_size,
- color_mode=color_mode,
- classes=classes,
- class_mode=class_mode,
- batch_size=batch_size,
- shuffle=shuffle,
- seed=seed,
- data_format=data_format,
- save_to_dir=save_to_dir,
- save_prefix=save_prefix,
- save_format=save_format,
- follow_links=follow_links,
- subset=subset,
- interpolation=interpolation,
- **kwargs)
-
-
-@keras_export('keras.preprocessing.image.NumpyArrayIterator')
-class NumpyArrayIterator(image.NumpyArrayIterator, Iterator):
- """Iterator yielding data from a Numpy array.
-
- Args:
- x: Numpy array of input data or tuple.
- If tuple, the second elements is either
- another numpy array or a list of numpy arrays,
- each of which gets passed
- through as an output without any modifications.
- y: Numpy array of targets data.
- image_data_generator: Instance of `ImageDataGenerator`
- to use for random transformations and normalization.
- batch_size: Integer, size of a batch.
- shuffle: Boolean, whether to shuffle the data between epochs.
- sample_weight: Numpy array of sample weights.
- seed: Random seed for data shuffling.
- data_format: String, one of `channels_first`, `channels_last`.
- save_to_dir: Optional directory where to save the pictures
- being yielded, in a viewable format. This is useful
- for visualizing the random transformations being
- applied, for debugging purposes.
- save_prefix: String prefix to use for saving sample
- images (if `save_to_dir` is set).
- save_format: Format to use for saving sample images
- (if `save_to_dir` is set).
- subset: Subset of data (`"training"` or `"validation"`) if
- validation_split is set in ImageDataGenerator.
- dtype: Dtype to use for the generated arrays.
- """
-
- def __init__(self, x, y, image_data_generator,
- batch_size=32,
- shuffle=False,
- sample_weight=None,
- seed=None,
- data_format=None,
- save_to_dir=None,
- save_prefix='',
- save_format='png',
- subset=None,
- dtype=None):
- if data_format is None:
- data_format = backend.image_data_format()
- kwargs = {}
- if 'dtype' in tf_inspect.getfullargspec(
- image.NumpyArrayIterator.__init__)[0]:
- if dtype is None:
- dtype = backend.floatx()
- kwargs['dtype'] = dtype
- super(NumpyArrayIterator, self).__init__(
- x, y, image_data_generator,
- batch_size=batch_size,
- shuffle=shuffle,
- sample_weight=sample_weight,
- seed=seed,
- data_format=data_format,
- save_to_dir=save_to_dir,
- save_prefix=save_prefix,
- save_format=save_format,
- subset=subset,
- **kwargs)
-
-
-class DataFrameIterator(image.DataFrameIterator, Iterator): # pylint: disable=inconsistent-mro
- """Iterator capable of reading images from a directory on disk as a dataframe.
-
- Args:
- dataframe: Pandas dataframe containing the filepaths relative to
- `directory` (or absolute paths if `directory` is None) of the images in
- a string column. It should include other column/s depending on the
- `class_mode`:
- - if `class_mode` is `"categorical"` (default value) it must include
- the `y_col` column with the class(es) of each image. Values in the
- column can be a string/list/tuple for a single class, or a list/tuple
- for multiple classes.
- - if `class_mode` is `"binary"` or `"sparse"` it must include the
- given `y_col` column with class values as strings.
- - if `class_mode` is `"raw"` or `"multi_output"` it should contain the
- columns specified in `y_col`.
- - if `class_mode` is `"input"` or `None` no extra column is needed.
- directory: string, path to the directory to read images from. If `None`,
- data in `x_col` column should be absolute paths.
- image_data_generator: Instance of `ImageDataGenerator` to use for random
- transformations and normalization. If None, no transformations and
- normalizations are made.
- x_col: string, column in `dataframe` that contains the filenames (or
- absolute paths if `directory` is `None`).
- y_col: string or list, column/s in `dataframe` that has the target data.
- weight_col: string, column in `dataframe` that contains the sample
- weights. Default: `None`.
- target_size: tuple of integers, dimensions to resize input images to.
- color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. Color mode to read
- images.
- classes: Optional list of strings, classes to use (e.g. `["dogs",
- "cats"]`). If None, all classes in `y_col` will be used.
- class_mode: one of "binary", "categorical", "input", "multi_output",
- "raw", "sparse" or None. Default: "categorical".
- Mode for yielding the targets:
- - `"binary"`: 1D numpy array of binary labels,
- - `"categorical"`: 2D numpy array of one-hot encoded labels. Supports
- multi-label output.
- - `"input"`: images identical to input images (mainly used to work
- with autoencoders),
- - `"multi_output"`: list with the values of the different columns,
- - `"raw"`: numpy array of values in `y_col` column(s),
- - `"sparse"`: 1D numpy array of integer labels,
- - `None`, no targets are returned (the generator will only yield
- batches of image data, which is useful to use in `model.predict()`).
- batch_size: Integer, size of a batch.
- shuffle: Boolean, whether to shuffle the data between epochs.
- seed: Random seed for data shuffling.
- data_format: String, one of `channels_first`, `channels_last`.
- save_to_dir: Optional directory where to save the pictures being yielded,
- in a viewable format. This is useful for visualizing the random
- transformations being applied, for debugging purposes.
- save_prefix: String prefix to use for saving sample images (if
- `save_to_dir` is set).
- save_format: Format to use for saving sample images (if `save_to_dir` is
- set).
- subset: Subset of data (`"training"` or `"validation"`) if
- validation_split is set in ImageDataGenerator.
- interpolation: Interpolation method used to resample the image if the
- target size is different from that of the loaded image. Supported
- methods are "nearest", "bilinear", and "bicubic". If PIL version 1.1.3
- or newer is installed, "lanczos" is also supported. If PIL version 3.4.0
- or newer is installed, "box" and "hamming" are also supported. By
- default, "nearest" is used.
- dtype: Dtype to use for the generated arrays.
- validate_filenames: Boolean, whether to validate image filenames in
- `x_col`. If `True`, invalid images will be ignored. Disabling this
- option can lead to a speed-up in the instantiation of this class.
- Default: `True`.
- """
-
- def __init__(
- self,
- dataframe,
- directory=None,
- image_data_generator=None,
- x_col='filename',
- y_col='class',
- weight_col=None,
- target_size=(256, 256),
- color_mode='rgb',
- classes=None,
- class_mode='categorical',
- batch_size=32,
- shuffle=True,
- seed=None,
- data_format='channels_last',
- save_to_dir=None,
- save_prefix='',
- save_format='png',
- subset=None,
- interpolation='nearest',
- dtype='float32',
- validate_filenames=True):
- super(DataFrameIterator, self).__init__(
- dataframe=dataframe,
- directory=directory,
- image_data_generator=image_data_generator,
- x_col=x_col,
- y_col=y_col,
- weight_col=weight_col,
- target_size=target_size,
- color_mode=color_mode,
- classes=classes,
- class_mode=class_mode,
- batch_size=batch_size,
- shuffle=shuffle,
- seed=seed,
- data_format=data_format,
- save_to_dir=save_to_dir,
- save_prefix=save_prefix,
- save_format=save_format,
- subset=subset,
- interpolation=interpolation,
- dtype=dtype,
- validate_filenames=validate_filenames
- )
-
-
-@keras_export('keras.preprocessing.image.ImageDataGenerator')
-class ImageDataGenerator(image.ImageDataGenerator):
- """Generate batches of tensor image data with real-time data augmentation.
-
- The data will be looped over (in batches).
-
- Args:
- featurewise_center: Boolean.
- Set input mean to 0 over the dataset, feature-wise.
- samplewise_center: Boolean. Set each sample mean to 0.
- featurewise_std_normalization: Boolean.
- Divide inputs by std of the dataset, feature-wise.
- samplewise_std_normalization: Boolean. Divide each input by its std.
- zca_epsilon: epsilon for ZCA whitening. Default is 1e-6.
- zca_whitening: Boolean. Apply ZCA whitening.
- rotation_range: Int. Degree range for random rotations.
- width_shift_range: Float, 1-D array-like or int
- - float: fraction of total width, if < 1, or pixels if >= 1.
- - 1-D array-like: random elements from the array.
- - int: integer number of pixels from interval
- `(-width_shift_range, +width_shift_range)`
- - With `width_shift_range=2` possible values
- are integers `[-1, 0, +1]`,
- same as with `width_shift_range=[-1, 0, +1]`,
- while with `width_shift_range=1.0` possible values are floats
- in the interval [-1.0, +1.0).
- height_shift_range: Float, 1-D array-like or int
- - float: fraction of total height, if < 1, or pixels if >= 1.
- - 1-D array-like: random elements from the array.
- - int: integer number of pixels from interval
- `(-height_shift_range, +height_shift_range)`
- - With `height_shift_range=2` possible values
- are integers `[-1, 0, +1]`,
- same as with `height_shift_range=[-1, 0, +1]`,
- while with `height_shift_range=1.0` possible values are floats
- in the interval [-1.0, +1.0).
- brightness_range: Tuple or list of two floats. Range for picking
- a brightness shift value from.
- shear_range: Float. Shear Intensity
- (Shear angle in counter-clockwise direction in degrees)
- zoom_range: Float or [lower, upper]. Range for random zoom.
- If a float, `[lower, upper] = [1-zoom_range, 1+zoom_range]`.
- channel_shift_range: Float. Range for random channel shifts.
- fill_mode: One of {"constant", "nearest", "reflect" or "wrap"}.
- Default is 'nearest'.
- Points outside the boundaries of the input are filled
- according to the given mode:
- - 'constant': kkkkkkkk|abcd|kkkkkkkk (cval=k)
- - 'nearest': aaaaaaaa|abcd|dddddddd
- - 'reflect': abcddcba|abcd|dcbaabcd
- - 'wrap': abcdabcd|abcd|abcdabcd
- cval: Float or Int.
- Value used for points outside the boundaries
- when `fill_mode = "constant"`.
- horizontal_flip: Boolean. Randomly flip inputs horizontally.
- vertical_flip: Boolean. Randomly flip inputs vertically.
- rescale: rescaling factor. Defaults to None.
- If None or 0, no rescaling is applied,
- otherwise we multiply the data by the value provided
- (after applying all other transformations).
- preprocessing_function: function that will be applied on each input.
- The function will run after the image is resized and augmented.
- The function should take one argument:
- one image (Numpy tensor with rank 3),
- and should output a Numpy tensor with the same shape.
- data_format: Image data format,
- either "channels_first" or "channels_last".
- "channels_last" mode means that the images should have shape
- `(samples, height, width, channels)`,
- "channels_first" mode means that the images should have shape
- `(samples, channels, height, width)`.
- It defaults to the `image_data_format` value found in your
- Keras config file at `~/.keras/keras.json`.
- If you never set it, then it will be "channels_last".
- validation_split: Float. Fraction of images reserved for validation
- (strictly between 0 and 1).
- dtype: Dtype to use for the generated arrays.
-
- Raises:
- ValueError: If the value of the argument, `data_format` is other than
- `"channels_last"` or `"channels_first"`.
- ValueError: If the value of the argument, `validation_split` > 1
- or `validation_split` < 0.
-
- Examples:
-
- Example of using `.flow(x, y)`:
-
- ```python
- (x_train, y_train), (x_test, y_test) = cifar10.load_data()
- y_train = utils.to_categorical(y_train, num_classes)
- y_test = utils.to_categorical(y_test, num_classes)
- datagen = ImageDataGenerator(
- featurewise_center=True,
- featurewise_std_normalization=True,
- rotation_range=20,
- width_shift_range=0.2,
- height_shift_range=0.2,
- horizontal_flip=True,
- validation_split=0.2)
- # compute quantities required for featurewise normalization
- # (std, mean, and principal components if ZCA whitening is applied)
- datagen.fit(x_train)
- # fits the model on batches with real-time data augmentation:
- model.fit(datagen.flow(x_train, y_train, batch_size=32,
- subset='training'),
- validation_data=datagen.flow(x_train, y_train,
- batch_size=8, subset='validation'),
- steps_per_epoch=len(x_train) / 32, epochs=epochs)
- # here's a more "manual" example
- for e in range(epochs):
- print('Epoch', e)
- batches = 0
- for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=32):
- model.fit(x_batch, y_batch)
- batches += 1
- if batches >= len(x_train) / 32:
- # we need to break the loop by hand because
- # the generator loops indefinitely
- break
- ```
-
- Example of using `.flow_from_directory(directory)`:
-
- ```python
- train_datagen = ImageDataGenerator(
- rescale=1./255,
- shear_range=0.2,
- zoom_range=0.2,
- horizontal_flip=True)
- test_datagen = ImageDataGenerator(rescale=1./255)
- train_generator = train_datagen.flow_from_directory(
- 'data/train',
- target_size=(150, 150),
- batch_size=32,
- class_mode='binary')
- validation_generator = test_datagen.flow_from_directory(
- 'data/validation',
- target_size=(150, 150),
- batch_size=32,
- class_mode='binary')
- model.fit(
- train_generator,
- steps_per_epoch=2000,
- epochs=50,
- validation_data=validation_generator,
- validation_steps=800)
- ```
-
- Example of transforming images and masks together.
-
- ```python
- # we create two instances with the same arguments
- data_gen_args = dict(featurewise_center=True,
- featurewise_std_normalization=True,
- rotation_range=90,
- width_shift_range=0.1,
- height_shift_range=0.1,
- zoom_range=0.2)
- image_datagen = ImageDataGenerator(**data_gen_args)
- mask_datagen = ImageDataGenerator(**data_gen_args)
- # Provide the same seed and keyword arguments to the fit and flow methods
- seed = 1
- image_datagen.fit(images, augment=True, seed=seed)
- mask_datagen.fit(masks, augment=True, seed=seed)
- image_generator = image_datagen.flow_from_directory(
- 'data/images',
- class_mode=None,
- seed=seed)
- mask_generator = mask_datagen.flow_from_directory(
- 'data/masks',
- class_mode=None,
- seed=seed)
- # combine generators into one which yields image and masks
- train_generator = zip(image_generator, mask_generator)
- model.fit(
- train_generator,
- steps_per_epoch=2000,
- epochs=50)
- ```
- """
-
- def __init__(self,
- featurewise_center=False,
- samplewise_center=False,
- featurewise_std_normalization=False,
- samplewise_std_normalization=False,
- zca_whitening=False,
- zca_epsilon=1e-6,
- rotation_range=0,
- width_shift_range=0.,
- height_shift_range=0.,
- brightness_range=None,
- shear_range=0.,
- zoom_range=0.,
- channel_shift_range=0.,
- fill_mode='nearest',
- cval=0.,
- horizontal_flip=False,
- vertical_flip=False,
- rescale=None,
- preprocessing_function=None,
- data_format=None,
- validation_split=0.0,
- dtype=None):
- if data_format is None:
- data_format = backend.image_data_format()
- kwargs = {}
- if 'dtype' in tf_inspect.getfullargspec(
- image.ImageDataGenerator.__init__)[0]:
- if dtype is None:
- dtype = backend.floatx()
- kwargs['dtype'] = dtype
- super(ImageDataGenerator, self).__init__(
- featurewise_center=featurewise_center,
- samplewise_center=samplewise_center,
- featurewise_std_normalization=featurewise_std_normalization,
- samplewise_std_normalization=samplewise_std_normalization,
- zca_whitening=zca_whitening,
- zca_epsilon=zca_epsilon,
- rotation_range=rotation_range,
- width_shift_range=width_shift_range,
- height_shift_range=height_shift_range,
- brightness_range=brightness_range,
- shear_range=shear_range,
- zoom_range=zoom_range,
- channel_shift_range=channel_shift_range,
- fill_mode=fill_mode,
- cval=cval,
- horizontal_flip=horizontal_flip,
- vertical_flip=vertical_flip,
- rescale=rescale,
- preprocessing_function=preprocessing_function,
- data_format=data_format,
- validation_split=validation_split,
- **kwargs)
-
- def flow(self,
- x,
- y=None,
- batch_size=32,
- shuffle=True,
- sample_weight=None,
- seed=None,
- save_to_dir=None,
- save_prefix='',
- save_format='png',
- subset=None):
- """Takes data & label arrays, generates batches of augmented data.
-
- Args:
- x: Input data. Numpy array of rank 4 or a tuple. If tuple, the first
- element should contain the images and the second element another numpy
- array or a list of numpy arrays that gets passed to the output without
- any modifications. Can be used to feed the model miscellaneous data
- along with the images. In case of grayscale data, the channels axis of
- the image array should have value 1, in case of RGB data, it should
- have value 3, and in case of RGBA data, it should have value 4.
- y: Labels.
- batch_size: Int (default: 32).
- shuffle: Boolean (default: True).
- sample_weight: Sample weights.
- seed: Int (default: None).
- save_to_dir: None or str (default: None). This allows you to optionally
- specify a directory to which to save the augmented pictures being
- generated (useful for visualizing what you are doing).
- save_prefix: Str (default: `''`). Prefix to use for filenames of saved
- pictures (only relevant if `save_to_dir` is set).
- save_format: one of "png", "jpeg", "bmp", "pdf", "ppm", "gif",
- "tif", "jpg"
- (only relevant if `save_to_dir` is set). Default: "png".
- subset: Subset of data (`"training"` or `"validation"`) if
- `validation_split` is set in `ImageDataGenerator`.
-
- Returns:
- An `Iterator` yielding tuples of `(x, y)`
- where `x` is a numpy array of image data
- (in the case of a single image input) or a list
- of numpy arrays (in the case with
- additional inputs) and `y` is a numpy array
- of corresponding labels. If 'sample_weight' is not None,
- the yielded tuples are of the form `(x, y, sample_weight)`.
- If `y` is None, only the numpy array `x` is returned.
- Raises:
- ValueError: If the value of the argument `subset` is other than
- "training" or "validation".
-
- """
- return NumpyArrayIterator(
- x,
- y,
- self,
- batch_size=batch_size,
- shuffle=shuffle,
- sample_weight=sample_weight,
- seed=seed,
- data_format=self.data_format,
- save_to_dir=save_to_dir,
- save_prefix=save_prefix,
- save_format=save_format,
- subset=subset)
-
- def flow_from_directory(self,
- directory,
- target_size=(256, 256),
- color_mode='rgb',
- classes=None,
- class_mode='categorical',
- batch_size=32,
- shuffle=True,
- seed=None,
- save_to_dir=None,
- save_prefix='',
- save_format='png',
- follow_links=False,
- subset=None,
- interpolation='nearest'):
- """Takes the path to a directory & generates batches of augmented data.
-
- Args:
- directory: string, path to the target directory. It should contain one
- subdirectory per class. Any PNG, JPG, BMP, PPM or TIF images inside
- each of the subdirectories' directory trees will be included in the
- generator. See [this script](
- https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d)
- for more details.
- target_size: Tuple of integers `(height, width)`, defaults to `(256,
- 256)`. The dimensions to which all images found will be resized.
- color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb". Whether
- the images will be converted to have 1, 3, or 4 channels.
- classes: Optional list of class subdirectories
- (e.g. `['dogs', 'cats']`). Default: None. If not provided, the list
- of classes will be automatically inferred from the subdirectory
- names/structure under `directory`, where each subdirectory will be
- treated as a different class (and the order of the classes, which
- will map to the label indices, will be alphanumeric). The
- dictionary containing the mapping from class names to class
- indices can be obtained via the attribute `class_indices`.
- class_mode: One of "categorical", "binary", "sparse",
- "input", or None. Default: "categorical".
- Determines the type of label arrays that are returned:
- - "categorical" will be 2D one-hot encoded labels,
- - "binary" will be 1D binary labels,
- - "sparse" will be 1D integer labels,
- - "input" will be images identical to input images (mainly used to
- work with autoencoders).
- - If None, no labels are returned (the generator will only yield
- batches of image data, which is useful to use with
- `model.predict()`).
- Please note that in case of `class_mode=None`, the data still needs to
- reside in a subdirectory of `directory` for it to work correctly.
- batch_size: Size of the batches of data (default: 32).
- shuffle: Whether to shuffle the data (default: True) If set to False,
- sorts the data in alphanumeric order.
- seed: Optional random seed for shuffling and transformations.
- save_to_dir: None or str (default: None). This allows you to optionally
- specify a directory to which to save the augmented pictures being
- generated (useful for visualizing what you are doing).
- save_prefix: Str. Prefix to use for filenames of saved pictures (only
- relevant if `save_to_dir` is set).
- save_format: one of "png", "jpeg", "bmp", "pdf", "ppm", "gif",
- "tif", "jpg"
- (only relevant if `save_to_dir` is set). Default: "png".
- follow_links: Whether to follow symlinks inside
- class subdirectories (default: False).
- subset: Subset of data (`"training"` or `"validation"`) if
- `validation_split` is set in `ImageDataGenerator`.
- interpolation: Interpolation method used to resample the image if the
- target size is different from that of the loaded image. Supported
- methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. If PIL version
- 1.1.3 or newer is installed, `"lanczos"` is also supported. If PIL
- version 3.4.0 or newer is installed, `"box"` and `"hamming"` are also
- supported. By default, `"nearest"` is used.
-
- Returns:
- A `DirectoryIterator` yielding tuples of `(x, y)`
- where `x` is a numpy array containing a batch
- of images with shape `(batch_size, *target_size, channels)`
- and `y` is a numpy array of corresponding labels.
- """
- return DirectoryIterator(
- directory,
- self,
- target_size=target_size,
- color_mode=color_mode,
- classes=classes,
- class_mode=class_mode,
- data_format=self.data_format,
- batch_size=batch_size,
- shuffle=shuffle,
- seed=seed,
- save_to_dir=save_to_dir,
- save_prefix=save_prefix,
- save_format=save_format,
- follow_links=follow_links,
- subset=subset,
- interpolation=interpolation)
-
- def flow_from_dataframe(self,
- dataframe,
- directory=None,
- x_col='filename',
- y_col='class',
- weight_col=None,
- target_size=(256, 256),
- color_mode='rgb',
- classes=None,
- class_mode='categorical',
- batch_size=32,
- shuffle=True,
- seed=None,
- save_to_dir=None,
- save_prefix='',
- save_format='png',
- subset=None,
- interpolation='nearest',
- validate_filenames=True,
- **kwargs):
- """Takes the dataframe and the path to a directory + generates batches.
-
- The generated batches contain augmented/normalized data.
-
- A simple tutorial can be found [here](
- http://bit.ly/keras_flow_from_dataframe).
-
- Args:
- dataframe: Pandas dataframe containing the filepaths relative to
- `directory` (or absolute paths if `directory` is None) of the images
- in a string column. It should include other columns
- depending on the `class_mode`:
- - if `class_mode` is `"categorical"` (default value) it must include
- the `y_col` column with the class(es) of each image. Values in the
- column can be a string (if a single class) or a list/tuple (if
- multiple classes).
- - if `class_mode` is `"binary"` or `"sparse"` it must include the
- given `y_col` column with class values as strings.
- - if `class_mode` is `"raw"` or `"multi_output"` it should contain
- the columns specified in `y_col`.
- - if `class_mode` is `"input"` or `None` no extra column is needed.
- directory: string, path to the directory to read images from. If `None`,
- data in `x_col` column should be absolute paths.
- x_col: string, column in `dataframe` that contains the filenames (or
- absolute paths if `directory` is `None`).
- y_col: string or list, column/s in `dataframe` that has the target data.
- weight_col: string, column in `dataframe` that contains the sample
- weights. Default: `None`.
- target_size: tuple of integers `(height, width)`, default: `(256, 256)`.
- The dimensions to which all images found will be resized.
- color_mode: one of "grayscale", "rgb", "rgba". Default: "rgb". Whether
- the images will be converted to have 1, 3, or 4 channels.
- classes: optional list of classes (e.g. `['dogs', 'cats']`). Default is
- None. If not provided, the list of classes will be automatically
- inferred from `y_col`, and the order of the classes (which will map
- to the label indices) will be alphanumeric. The dictionary containing
- the mapping from class names to class indices can be obtained via the
- attribute `class_indices`.
- class_mode: one of "binary", "categorical", "input", "multi_output",
- "raw", sparse" or None. Default: "categorical".
- Mode for yielding the targets:
- - `"binary"`: 1D numpy array of binary labels,
- - `"categorical"`: 2D numpy array of one-hot encoded labels.
- Supports multi-label output.
- - `"input"`: images identical to input images (mainly used to work
- with autoencoders),
- - `"multi_output"`: list with the values of the different columns,
- - `"raw"`: numpy array of values in `y_col` column(s),
- - `"sparse"`: 1D numpy array of integer labels,
- - `None`, no targets are returned (the generator will only yield
- batches of image data, which is useful to use in
- `model.predict()`).
- batch_size: size of the batches of data (default: 32).
- shuffle: whether to shuffle the data (default: True)
- seed: optional random seed for shuffling and transformations.
- save_to_dir: None or str (default: None). This allows you to optionally
- specify a directory to which to save the augmented pictures being
- generated (useful for visualizing what you are doing).
- save_prefix: str. Prefix to use for filenames of saved pictures (only
- relevant if `save_to_dir` is set).
- save_format: one of "png", "jpeg", "bmp", "pdf", "ppm", "gif",
- "tif", "jpg"
- (only relevant if `save_to_dir` is set). Default: "png".
- subset: Subset of data (`"training"` or `"validation"`) if
- `validation_split` is set in `ImageDataGenerator`.
- interpolation: Interpolation method used to resample the image if the
- target size is different from that of the loaded image. Supported
- methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. If PIL version
- 1.1.3 or newer is installed, `"lanczos"` is also supported. If PIL
- version 3.4.0 or newer is installed, `"box"` and `"hamming"` are also
- supported. By default, `"nearest"` is used.
- validate_filenames: Boolean, whether to validate image filenames in
- `x_col`. If `True`, invalid images will be ignored. Disabling this
- option can lead to speed-up in the execution of this function.
- Defaults to `True`.
- **kwargs: legacy arguments for raising deprecation warnings.
-
- Returns:
- A `DataFrameIterator` yielding tuples of `(x, y)`
- where `x` is a numpy array containing a batch
- of images with shape `(batch_size, *target_size, channels)`
- and `y` is a numpy array of corresponding labels.
- """
- if 'has_ext' in kwargs:
- tf_logging.warning(
- 'has_ext is deprecated, filenames in the dataframe have '
- 'to match the exact filenames on disk.', DeprecationWarning)
- if 'sort' in kwargs:
- tf_logging.warning(
- 'sort is deprecated, batches will be created in the '
- 'same order as the filenames provided if shuffle '
- 'is set to False.', DeprecationWarning)
- if class_mode == 'other':
- tf_logging.warning(
- '`class_mode` "other" is deprecated, please use '
- '`class_mode` "raw".', DeprecationWarning)
- class_mode = 'raw'
- if 'drop_duplicates' in kwargs:
- tf_logging.warning(
- 'drop_duplicates is deprecated, you can drop duplicates '
- 'by using the pandas.DataFrame.drop_duplicates method.',
- DeprecationWarning)
-
- return DataFrameIterator(
- dataframe,
- directory,
- self,
- x_col=x_col,
- y_col=y_col,
- weight_col=weight_col,
- target_size=target_size,
- color_mode=color_mode,
- classes=classes,
- class_mode=class_mode,
- data_format=self.data_format,
- batch_size=batch_size,
- shuffle=shuffle,
- seed=seed,
- save_to_dir=save_to_dir,
- save_prefix=save_prefix,
- save_format=save_format,
- subset=subset,
- interpolation=interpolation,
- validate_filenames=validate_filenames)
-
-
-keras_export('keras.preprocessing.image.random_rotation')(random_rotation)
-keras_export('keras.preprocessing.image.random_shift')(random_shift)
-keras_export('keras.preprocessing.image.random_shear')(random_shear)
-keras_export('keras.preprocessing.image.random_zoom')(random_zoom)
-keras_export(
- 'keras.preprocessing.image.apply_channel_shift')(apply_channel_shift)
-keras_export(
- 'keras.preprocessing.image.random_channel_shift')(random_channel_shift)
-keras_export(
- 'keras.preprocessing.image.apply_brightness_shift')(apply_brightness_shift)
-keras_export('keras.preprocessing.image.random_brightness')(random_brightness)
-keras_export(
- 'keras.preprocessing.image.apply_affine_transform')(apply_affine_transform)
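Note: the `ImageDataGenerator` entry points deleted above remain available from the standalone `keras` pip package. The sketch below is illustrative only and is not part of this change; it shows roughly equivalent augmentation built from the Keras preprocessing layers that supersede this class, assuming a TF 2.6+ installation where `Rescaling`, `RandomFlip`, `RandomRotation` and `RandomTranslation` are exported under `tf.keras.layers`.

```python
# Illustrative sketch only, not part of this change. Assumes TF 2.6+ where the
# preprocessing layers are exported under tf.keras.layers.
import tensorflow as tf

augment = tf.keras.Sequential([
    tf.keras.layers.Rescaling(1. / 255),          # ~ rescale=1./255
    tf.keras.layers.RandomFlip('horizontal'),     # ~ horizontal_flip=True
    tf.keras.layers.RandomRotation(20. / 360.),   # ~ rotation_range=20 degrees
                                                  #   (factor is a fraction of 2*pi)
    tf.keras.layers.RandomTranslation(0.2, 0.2),  # ~ height/width_shift_range=0.2
])

# The layers can be applied inside a model, or mapped over a tf.data pipeline:
# dataset = dataset.map(lambda x, y: (augment(x, training=True), y))
```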
diff --git a/tensorflow/python/keras/preprocessing/image_dataset.py b/tensorflow/python/keras/preprocessing/image_dataset.py
deleted file mode 100644
index e87159f..0000000
--- a/tensorflow/python/keras/preprocessing/image_dataset.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras image dataset loading utilities."""
-# pylint: disable=g-classes-have-attributes
-
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.keras.layers.preprocessing import image_preprocessing
-from tensorflow.python.keras.preprocessing import dataset_utils
-from tensorflow.python.keras.preprocessing import image as keras_image_ops
-from tensorflow.python.ops import image_ops
-from tensorflow.python.ops import io_ops
-from tensorflow.python.util.tf_export import keras_export
-
-
-ALLOWLIST_FORMATS = ('.bmp', '.gif', '.jpeg', '.jpg', '.png')
-
-
-@keras_export('keras.utils.image_dataset_from_directory',
- 'keras.preprocessing.image_dataset_from_directory',
- v1=[])
-def image_dataset_from_directory(directory,
- labels='inferred',
- label_mode='int',
- class_names=None,
- color_mode='rgb',
- batch_size=32,
- image_size=(256, 256),
- shuffle=True,
- seed=None,
- validation_split=None,
- subset=None,
- interpolation='bilinear',
- follow_links=False,
- crop_to_aspect_ratio=False,
- **kwargs):
- """Generates a `tf.data.Dataset` from image files in a directory.
-
- If your directory structure is:
-
- ```
- main_directory/
- ...class_a/
- ......a_image_1.jpg
- ......a_image_2.jpg
- ...class_b/
- ......b_image_1.jpg
- ......b_image_2.jpg
- ```
-
- Then calling `image_dataset_from_directory(main_directory, labels='inferred')`
- will return a `tf.data.Dataset` that yields batches of images from
- the subdirectories `class_a` and `class_b`, together with labels
- 0 and 1 (0 corresponding to `class_a` and 1 corresponding to `class_b`).
-
- Supported image formats: jpeg, png, bmp, gif.
- Animated gifs are truncated to the first frame.
-
- Args:
- directory: Directory where the data is located.
- If `labels` is "inferred", it should contain
- subdirectories, each containing images for a class.
- Otherwise, the directory structure is ignored.
- labels: Either "inferred"
- (labels are generated from the directory structure),
- None (no labels),
- or a list/tuple of integer labels of the same size as the number of
- image files found in the directory. Labels should be sorted according
- to the alphanumeric order of the image file paths
- (obtained via `os.walk(directory)` in Python).
- label_mode:
- - 'int': means that the labels are encoded as integers
- (e.g. for `sparse_categorical_crossentropy` loss).
- - 'categorical' means that the labels are
- encoded as a categorical vector
- (e.g. for `categorical_crossentropy` loss).
- - 'binary' means that the labels (there can be only 2)
- are encoded as `float32` scalars with values 0 or 1
- (e.g. for `binary_crossentropy`).
- - None (no labels).
- class_names: Only valid if "labels" is "inferred". This is the explicit
- list of class names (must match names of subdirectories). Used
- to control the order of the classes
- (otherwise alphanumerical order is used).
- color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb".
- Whether the images will be converted to
- have 1, 3, or 4 channels.
- batch_size: Size of the batches of data. Default: 32.
- image_size: Size to resize images to after they are read from disk.
- Defaults to `(256, 256)`.
- Since the pipeline processes batches of images that must all have
- the same size, this must be provided.
- shuffle: Whether to shuffle the data. Default: True.
- If set to False, sorts the data in alphanumeric order.
- seed: Optional random seed for shuffling and transformations.
- validation_split: Optional float between 0 and 1,
- fraction of data to reserve for validation.
- subset: One of "training" or "validation".
- Only used if `validation_split` is set.
- interpolation: String, the interpolation method used when resizing images.
- Defaults to `bilinear`. Supports `bilinear`, `nearest`, `bicubic`,
- `area`, `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic`.
- follow_links: Whether to visit subdirectories pointed to by symlinks.
- Defaults to False.
- crop_to_aspect_ratio: If True, resize the images without aspect
- ratio distortion. When the original aspect ratio differs from the target
- aspect ratio, the output image will be cropped so as to return the largest
- possible window in the image (of size `image_size`) that matches
- the target aspect ratio. By default (`crop_to_aspect_ratio=False`),
- aspect ratio may not be preserved.
- **kwargs: Legacy keyword arguments.
-
- Returns:
- A `tf.data.Dataset` object.
- - If `label_mode` is None, it yields `float32` tensors of shape
- `(batch_size, image_size[0], image_size[1], num_channels)`,
- encoding images (see below for rules regarding `num_channels`).
- - Otherwise, it yields a tuple `(images, labels)`, where `images`
- has shape `(batch_size, image_size[0], image_size[1], num_channels)`,
- and `labels` follows the format described below.
-
- Rules regarding labels format:
- - if `label_mode` is `int`, the labels are an `int32` tensor of shape
- `(batch_size,)`.
- - if `label_mode` is `binary`, the labels are a `float32` tensor of
- 1s and 0s of shape `(batch_size, 1)`.
- - if `label_mode` is `categorical`, the labels are a `float32` tensor
- of shape `(batch_size, num_classes)`, representing a one-hot
- encoding of the class index.
-
- Rules regarding number of channels in the yielded images:
- - if `color_mode` is `grayscale`,
- there's 1 channel in the image tensors.
- - if `color_mode` is `rgb`,
- there are 3 channels in the image tensors.
- - if `color_mode` is `rgba`,
- there are 4 channels in the image tensors.
- """
- if 'smart_resize' in kwargs:
- crop_to_aspect_ratio = kwargs.pop('smart_resize')
- if kwargs:
- raise TypeError(f'Unknown keyword argument(s): {tuple(kwargs.keys())}')
- if labels not in ('inferred', None):
- if not isinstance(labels, (list, tuple)):
- raise ValueError(
- '`labels` argument should be a list/tuple of integer labels, of '
- 'the same size as the number of image files in the target '
- 'directory. If you wish to infer the labels from the subdirectory '
- 'names in the target directory, pass `labels="inferred"`. '
- 'If you wish to get a dataset that only contains images '
- '(no labels), pass `label_mode=None`.')
- if class_names:
- raise ValueError('You can only pass `class_names` if the labels are '
- 'inferred from the subdirectory names in the target '
- 'directory (`labels="inferred"`).')
- if label_mode not in {'int', 'categorical', 'binary', None}:
- raise ValueError(
- '`label_mode` argument must be one of "int", "categorical", "binary", '
- 'or None. Received: %s' % (label_mode,))
- if labels is None or label_mode is None:
- labels = None
- label_mode = None
- if color_mode == 'rgb':
- num_channels = 3
- elif color_mode == 'rgba':
- num_channels = 4
- elif color_mode == 'grayscale':
- num_channels = 1
- else:
- raise ValueError(
- '`color_mode` must be one of {"rgb", "rgba", "grayscale"}. '
- 'Received: %s' % (color_mode,))
- interpolation = image_preprocessing.get_interpolation(interpolation)
- dataset_utils.check_validation_split_arg(
- validation_split, subset, shuffle, seed)
-
- if seed is None:
- seed = np.random.randint(1e6)
- image_paths, labels, class_names = dataset_utils.index_directory(
- directory,
- labels,
- formats=ALLOWLIST_FORMATS,
- class_names=class_names,
- shuffle=shuffle,
- seed=seed,
- follow_links=follow_links)
-
- if label_mode == 'binary' and len(class_names) != 2:
- raise ValueError(
- 'When passing `label_mode="binary"`, there must be exactly 2 classes. '
- 'Found the following classes: %s' % (class_names,))
-
- image_paths, labels = dataset_utils.get_training_or_validation_split(
- image_paths, labels, validation_split, subset)
- if not image_paths:
- raise ValueError('No images found.')
-
- dataset = paths_and_labels_to_dataset(
- image_paths=image_paths,
- image_size=image_size,
- num_channels=num_channels,
- labels=labels,
- label_mode=label_mode,
- num_classes=len(class_names),
- interpolation=interpolation,
- crop_to_aspect_ratio=crop_to_aspect_ratio)
- if shuffle:
- # Shuffle locally at each iteration
- dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed)
- dataset = dataset.batch(batch_size)
- # Users may need to reference `class_names`.
- dataset.class_names = class_names
- # Include file paths for images as attribute.
- dataset.file_paths = image_paths
- return dataset
-
-
-def paths_and_labels_to_dataset(image_paths,
- image_size,
- num_channels,
- labels,
- label_mode,
- num_classes,
- interpolation,
- crop_to_aspect_ratio=False):
- """Constructs a dataset of images and labels."""
- # TODO(fchollet): consider making num_parallel_calls settable
- path_ds = dataset_ops.Dataset.from_tensor_slices(image_paths)
- args = (image_size, num_channels, interpolation, crop_to_aspect_ratio)
- img_ds = path_ds.map(
- lambda x: load_image(x, *args))
- if label_mode:
- label_ds = dataset_utils.labels_to_dataset(labels, label_mode, num_classes)
- img_ds = dataset_ops.Dataset.zip((img_ds, label_ds))
- return img_ds
-
-
-def load_image(path, image_size, num_channels, interpolation,
- crop_to_aspect_ratio=False):
- """Load an image from a path and resize it."""
- img = io_ops.read_file(path)
- img = image_ops.decode_image(
- img, channels=num_channels, expand_animations=False)
- if crop_to_aspect_ratio:
- img = keras_image_ops.smart_resize(img, image_size,
- interpolation=interpolation)
- else:
- img = image_ops.resize_images_v2(img, image_size, method=interpolation)
- img.set_shape((image_size[0], image_size[1], num_channels))
- return img
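The loader implementation deleted here now lives in the keras pip package. For reference, a minimal usage sketch of the public symbol it exported (`tf.keras.utils.image_dataset_from_directory`, per the `keras_export` above) follows; `'data/train'` is a hypothetical directory with one subdirectory per class, as required when `labels='inferred'`.

```python
# Minimal usage sketch; 'data/train' is a hypothetical directory with one
# subdirectory per class.
import tensorflow as tf

train_ds = tf.keras.utils.image_dataset_from_directory(
    'data/train',
    labels='inferred',
    label_mode='int',           # int32 labels, e.g. for sparse_categorical_crossentropy
    image_size=(256, 256),      # every image is resized to this size
    batch_size=32,
    validation_split=0.2,
    subset='training',
    seed=1337)                  # the validation call must reuse the same seed

val_ds = tf.keras.utils.image_dataset_from_directory(
    'data/train',
    label_mode='int',
    image_size=(256, 256),
    batch_size=32,
    validation_split=0.2,
    subset='validation',
    seed=1337)

print(train_ds.class_names)     # class names inferred from subdirectory names
```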
diff --git a/tensorflow/python/keras/preprocessing/image_dataset_test.py b/tensorflow/python/keras/preprocessing/image_dataset_test.py
deleted file mode 100644
index 51f2dc6..0000000
--- a/tensorflow/python/keras/preprocessing/image_dataset_test.py
+++ /dev/null
@@ -1,354 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for image_dataset."""
-
-import os
-import shutil
-
-import numpy as np
-
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.eager import def_function
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.preprocessing import image as image_preproc
-from tensorflow.python.keras.preprocessing import image_dataset
-from tensorflow.python.platform import test
-
-try:
- import PIL # pylint:disable=g-import-not-at-top
-except ImportError:
- PIL = None
-
-
-class ImageDatasetFromDirectoryTest(keras_parameterized.TestCase):
-
- def _get_images(self, count=16, color_mode='rgb'):
- width = height = 24
- imgs = []
- for _ in range(count):
- if color_mode == 'grayscale':
- img = np.random.randint(0, 256, size=(height, width, 1))
- elif color_mode == 'rgba':
- img = np.random.randint(0, 256, size=(height, width, 4))
- else:
- img = np.random.randint(0, 256, size=(height, width, 3))
- img = image_preproc.array_to_img(img)
- imgs.append(img)
- return imgs
-
- def _prepare_directory(self,
- num_classes=2,
- grayscale=False,
- nested_dirs=False,
- color_mode='rgb',
- count=16):
- # Get a unique temp directory
- temp_dir = os.path.join(self.get_temp_dir(), str(np.random.randint(1e6)))
- os.mkdir(temp_dir)
- self.addCleanup(shutil.rmtree, temp_dir)
-
- # Generate paths to class subdirectories
- paths = []
- for class_index in range(num_classes):
- class_directory = 'class_%s' % (class_index,)
- if nested_dirs:
- class_paths = [
- class_directory, os.path.join(class_directory, 'subfolder_1'),
- os.path.join(class_directory, 'subfolder_2'), os.path.join(
- class_directory, 'subfolder_1', 'sub-subfolder')
- ]
- else:
- class_paths = [class_directory]
- for path in class_paths:
- os.mkdir(os.path.join(temp_dir, path))
- paths += class_paths
-
- # Save images to the paths
- i = 0
- for img in self._get_images(color_mode=color_mode, count=count):
- path = paths[i % len(paths)]
- if color_mode == 'rgb':
- ext = 'jpg'
- else:
- ext = 'png'
- filename = os.path.join(path, 'image_%s.%s' % (i, ext))
- img.save(os.path.join(temp_dir, filename))
- i += 1
- return temp_dir
-
- def test_image_dataset_from_directory_standalone(self):
- # Test retrieving images without labels from a directory and its subdirs.
- if PIL is None:
- return # Skip test if PIL is not available.
-
- # Save a few extra images in the parent directory.
- directory = self._prepare_directory(count=7, num_classes=2)
- for i, img in enumerate(self._get_images(3)):
- filename = 'image_%s.jpg' % (i,)
- img.save(os.path.join(directory, filename))
-
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=5, image_size=(18, 18), labels=None)
- batch = next(iter(dataset))
- # We return plain images
- self.assertEqual(batch.shape, (5, 18, 18, 3))
- self.assertEqual(batch.dtype.name, 'float32')
- # Count samples
- batch_count = 0
- sample_count = 0
- for batch in dataset:
- batch_count += 1
- sample_count += batch.shape[0]
- self.assertEqual(batch_count, 2)
- self.assertEqual(sample_count, 10)
-
- def test_image_dataset_from_directory_binary(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=2)
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode='int')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8, 18, 18, 3))
- self.assertEqual(batch[0].dtype.name, 'float32')
- self.assertEqual(batch[1].shape, (8,))
- self.assertEqual(batch[1].dtype.name, 'int32')
-
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode='binary')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8, 18, 18, 3))
- self.assertEqual(batch[0].dtype.name, 'float32')
- self.assertEqual(batch[1].shape, (8, 1))
- self.assertEqual(batch[1].dtype.name, 'float32')
-
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode='categorical')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8, 18, 18, 3))
- self.assertEqual(batch[0].dtype.name, 'float32')
- self.assertEqual(batch[1].shape, (8, 2))
- self.assertEqual(batch[1].dtype.name, 'float32')
-
- def test_static_shape_in_graph(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=2)
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode='int')
- test_case = self
-
- @def_function.function
- def symbolic_fn(ds):
- for x, _ in ds.take(1):
- test_case.assertListEqual(x.shape.as_list(), [None, 18, 18, 3])
-
- symbolic_fn(dataset)
-
- def test_sample_count(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=4, count=15)
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode=None)
- sample_count = 0
- for batch in dataset:
- sample_count += batch.shape[0]
- self.assertEqual(sample_count, 15)
-
- def test_image_dataset_from_directory_multiclass(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=4, count=15)
-
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode=None)
- batch = next(iter(dataset))
- self.assertEqual(batch.shape, (8, 18, 18, 3))
-
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode=None)
- sample_count = 0
- iterator = iter(dataset)
- for batch in dataset:
- sample_count += next(iterator).shape[0]
- self.assertEqual(sample_count, 15)
-
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode='int')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8, 18, 18, 3))
- self.assertEqual(batch[0].dtype.name, 'float32')
- self.assertEqual(batch[1].shape, (8,))
- self.assertEqual(batch[1].dtype.name, 'int32')
-
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode='categorical')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8, 18, 18, 3))
- self.assertEqual(batch[0].dtype.name, 'float32')
- self.assertEqual(batch[1].shape, (8, 4))
- self.assertEqual(batch[1].dtype.name, 'float32')
-
- def test_image_dataset_from_directory_color_modes(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=4, color_mode='rgba')
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), color_mode='rgba')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8, 18, 18, 4))
- self.assertEqual(batch[0].dtype.name, 'float32')
-
- directory = self._prepare_directory(num_classes=4, color_mode='grayscale')
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), color_mode='grayscale')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8, 18, 18, 1))
- self.assertEqual(batch[0].dtype.name, 'float32')
-
- def test_image_dataset_from_directory_validation_split(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=2, count=10)
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=10, image_size=(18, 18),
- validation_split=0.2, subset='training', seed=1337)
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8, 18, 18, 3))
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=10, image_size=(18, 18),
- validation_split=0.2, subset='validation', seed=1337)
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (2, 18, 18, 3))
-
- def test_image_dataset_from_directory_manual_labels(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=2, count=2)
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18),
- labels=[0, 1], shuffle=False)
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertAllClose(batch[1], [0, 1])
-
- def test_image_dataset_from_directory_follow_links(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=2, count=25,
- nested_dirs=True)
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=8, image_size=(18, 18), label_mode=None,
- follow_links=True)
- sample_count = 0
- for batch in dataset:
- sample_count += batch.shape[0]
- self.assertEqual(sample_count, 25)
-
- def test_image_dataset_from_directory_no_images(self):
- directory = self._prepare_directory(num_classes=2, count=0)
- with self.assertRaisesRegex(ValueError, 'No images found.'):
- _ = image_dataset.image_dataset_from_directory(directory)
-
- def test_image_dataset_from_directory_crop_to_aspect_ratio(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=2, count=5)
- dataset = image_dataset.image_dataset_from_directory(
- directory, batch_size=5, image_size=(18, 18), crop_to_aspect_ratio=True)
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (5, 18, 18, 3))
-
- def test_image_dataset_from_directory_errors(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- directory = self._prepare_directory(num_classes=3, count=5)
-
- with self.assertRaisesRegex(ValueError, '`labels` argument should be'):
- _ = image_dataset.image_dataset_from_directory(
- directory, labels='other')
-
- with self.assertRaisesRegex(ValueError, '`label_mode` argument must be'):
- _ = image_dataset.image_dataset_from_directory(
- directory, label_mode='other')
-
- with self.assertRaisesRegex(ValueError, '`color_mode` must be one of'):
- _ = image_dataset.image_dataset_from_directory(
- directory, color_mode='other')
-
- with self.assertRaisesRegex(
- ValueError, 'only pass `class_names` if the labels are inferred'):
- _ = image_dataset.image_dataset_from_directory(
- directory, labels=[0, 0, 1, 1, 1],
- class_names=['class_0', 'class_1', 'class_2'])
-
- with self.assertRaisesRegex(
- ValueError,
- 'Expected the lengths of `labels` to match the number of files'):
- _ = image_dataset.image_dataset_from_directory(
- directory, labels=[0, 0, 1, 1])
-
- with self.assertRaisesRegex(
- ValueError, '`class_names` passed did not match'):
- _ = image_dataset.image_dataset_from_directory(
- directory, class_names=['class_0', 'class_2'])
-
- with self.assertRaisesRegex(ValueError, 'there must be exactly 2 classes'):
- _ = image_dataset.image_dataset_from_directory(
- directory, label_mode='binary')
-
- with self.assertRaisesRegex(ValueError,
- '`validation_split` must be between 0 and 1'):
- _ = image_dataset.image_dataset_from_directory(
- directory, validation_split=2)
-
- with self.assertRaisesRegex(ValueError,
- '`subset` must be either "training" or'):
- _ = image_dataset.image_dataset_from_directory(
- directory, validation_split=0.2, subset='other')
-
- with self.assertRaisesRegex(ValueError, '`validation_split` must be set'):
- _ = image_dataset.image_dataset_from_directory(
- directory, validation_split=0, subset='training')
-
- with self.assertRaisesRegex(ValueError, 'must provide a `seed`'):
- _ = image_dataset.image_dataset_from_directory(
- directory, validation_split=0.2, subset='training')
-
-
-if __name__ == '__main__':
- v2_compat.enable_v2_behavior()
- test.main()
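For reference, the shape contract these deleted tests pin down can be summarized in a short sketch; `'some_dir'` is a hypothetical directory with two class subdirectories and at least 16 images, mirroring `_prepare_directory` above.

```python
# Shape contract asserted by the deleted tests above (hypothetical 'some_dir'
# with 2 classes, >= 16 images, image_size=(18, 18), batch_size=8).
import tensorflow as tf

load = lambda mode: next(iter(tf.keras.utils.image_dataset_from_directory(
    'some_dir', batch_size=8, image_size=(18, 18), label_mode=mode)))

images, labels = load('int')
assert images.shape == (8, 18, 18, 3) and labels.shape == (8,)   # int32 labels

_, labels = load('binary')
assert labels.shape == (8, 1)                                    # float32 0/1 labels

_, labels = load('categorical')
assert labels.shape == (8, 2)                                    # one-hot labels
```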
diff --git a/tensorflow/python/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py
deleted file mode 100644
index 464f166..0000000
--- a/tensorflow/python/keras/preprocessing/image_test.py
+++ /dev/null
@@ -1,449 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for image preprocessing utils."""
-
-import os
-import shutil
-import tempfile
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras import layers
-from tensorflow.python.keras.engine import sequential
-from tensorflow.python.keras.preprocessing import image as preprocessing_image
-from tensorflow.python.platform import test
-
-try:
- import PIL # pylint:disable=g-import-not-at-top
-except ImportError:
- PIL = None
-
-
-def _generate_test_images():
- img_w = img_h = 20
- rgb_images = []
- gray_images = []
- for _ in range(8):
- bias = np.random.rand(img_w, img_h, 1) * 64
- variance = np.random.rand(img_w, img_h, 1) * (255 - 64)
- imarray = np.random.rand(img_w, img_h, 3) * variance + bias
- im = preprocessing_image.array_to_img(imarray, scale=False)
- rgb_images.append(im)
-
- imarray = np.random.rand(img_w, img_h, 1) * variance + bias
- im = preprocessing_image.array_to_img(imarray, scale=False)
- gray_images.append(im)
-
- return [rgb_images, gray_images]
-
-
-class TestImage(keras_parameterized.TestCase):
-
- def test_smart_resize(self):
- test_input = np.random.random((20, 40, 3))
- output = preprocessing_image.smart_resize(test_input, size=(50, 50))
- self.assertIsInstance(output, np.ndarray)
- self.assertListEqual(list(output.shape), [50, 50, 3])
- output = preprocessing_image.smart_resize(test_input, size=(10, 10))
- self.assertListEqual(list(output.shape), [10, 10, 3])
- output = preprocessing_image.smart_resize(test_input, size=(100, 50))
- self.assertListEqual(list(output.shape), [100, 50, 3])
- output = preprocessing_image.smart_resize(test_input, size=(5, 15))
- self.assertListEqual(list(output.shape), [5, 15, 3])
-
- @parameterized.named_parameters(
- ('size1', (50, 50)),
- ('size2', (10, 10)),
- ('size3', (100, 50)),
- ('size4', (5, 15)))
- def test_smart_resize_tf_dataset(self, size):
- test_input_np = np.random.random((2, 20, 40, 3))
- test_ds = dataset_ops.Dataset.from_tensor_slices(test_input_np)
-
- resize = lambda img: preprocessing_image.smart_resize(img, size=size)
- test_ds = test_ds.map(resize)
- for sample in test_ds.as_numpy_iterator():
- self.assertIsInstance(sample, np.ndarray)
- self.assertListEqual(list(sample.shape), [size[0], size[1], 3])
-
- def test_smart_resize_batch(self):
- img = np.random.random((2, 20, 40, 3))
- out = preprocessing_image.smart_resize(img, size=(20, 20))
- self.assertListEqual(list(out.shape), [2, 20, 20, 3])
- self.assertAllClose(out, img[:, :, 10:-10, :])
-
- def test_smart_resize_errors(self):
- with self.assertRaisesRegex(ValueError, 'a tuple of 2 integers'):
- preprocessing_image.smart_resize(
- np.random.random((20, 20, 2)), size=(10, 5, 3))
- with self.assertRaisesRegex(ValueError, 'incorrect rank'):
- preprocessing_image.smart_resize(np.random.random((2, 4)), size=(10, 5))
- with self.assertRaisesRegex(ValueError, 'incorrect rank'):
- preprocessing_image.smart_resize(
- np.random.random((2, 4, 4, 5, 3)), size=(10, 5))
-
- def test_image_data_generator(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- for test_images in _generate_test_images():
- img_list = []
- for im in test_images:
- img_list.append(preprocessing_image.img_to_array(im)[None, ...])
-
- images = np.vstack(img_list)
- generator = preprocessing_image.ImageDataGenerator(
- featurewise_center=True,
- samplewise_center=True,
- featurewise_std_normalization=True,
- samplewise_std_normalization=True,
- zca_whitening=True,
- rotation_range=90.,
- width_shift_range=0.1,
- height_shift_range=0.1,
- shear_range=0.5,
- zoom_range=0.2,
- channel_shift_range=0.,
- brightness_range=(1, 5),
- fill_mode='nearest',
- cval=0.5,
- horizontal_flip=True,
- vertical_flip=True)
- # Basic test before fit
- x = np.random.random((32, 10, 10, 3))
- generator.flow(x)
-
- # Fit
- generator.fit(images, augment=True)
-
- for x, _ in generator.flow(
- images,
- np.arange(images.shape[0]),
- shuffle=True):
- self.assertEqual(x.shape[1:], images.shape[1:])
- break
-
- def test_image_data_generator_with_split_value_error(self):
- with self.assertRaises(ValueError):
- preprocessing_image.ImageDataGenerator(validation_split=5)
-
- def test_image_data_generator_invalid_data(self):
- generator = preprocessing_image.ImageDataGenerator(
- featurewise_center=True,
- samplewise_center=True,
- featurewise_std_normalization=True,
- samplewise_std_normalization=True,
- zca_whitening=True,
- data_format='channels_last')
-
- # Test fit with invalid data
- with self.assertRaises(ValueError):
- x = np.random.random((3, 10, 10))
- generator.fit(x)
- # Test flow with invalid data
- with self.assertRaises(ValueError):
- generator.flow(np.arange(5))
- # Invalid number of channels: will work but raise a warning
- x = np.random.random((32, 10, 10, 5))
- generator.flow(x)
-
- with self.assertRaises(ValueError):
- generator = preprocessing_image.ImageDataGenerator(
- data_format='unknown')
-
- generator = preprocessing_image.ImageDataGenerator(zoom_range=(2., 2.))
-
- def test_image_data_generator_fit(self):
- generator = preprocessing_image.ImageDataGenerator(
- featurewise_center=True,
- samplewise_center=True,
- featurewise_std_normalization=True,
- samplewise_std_normalization=True,
- zca_whitening=True,
- data_format='channels_last')
- # Test grayscale
- x = np.random.random((32, 10, 10, 1))
- generator.fit(x)
- # Test RGB
- x = np.random.random((32, 10, 10, 3))
- generator.fit(x)
- generator = preprocessing_image.ImageDataGenerator(
- featurewise_center=True,
- samplewise_center=True,
- featurewise_std_normalization=True,
- samplewise_std_normalization=True,
- zca_whitening=True,
- data_format='channels_first')
- # Test grayscale
- x = np.random.random((32, 1, 10, 10))
- generator.fit(x)
- # Test RGB
- x = np.random.random((32, 3, 10, 10))
- generator.fit(x)
-
- def test_directory_iterator(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- num_classes = 2
-
- temp_dir = self.get_temp_dir()
- self.addCleanup(shutil.rmtree, temp_dir)
-
- # create folders and subfolders
- paths = []
- for cl in range(num_classes):
- class_directory = 'class-{}'.format(cl)
- classpaths = [
- class_directory, os.path.join(class_directory, 'subfolder-1'),
- os.path.join(class_directory, 'subfolder-2'), os.path.join(
- class_directory, 'subfolder-1', 'sub-subfolder')
- ]
- for path in classpaths:
- os.mkdir(os.path.join(temp_dir, path))
- paths.append(classpaths)
-
- # save the images in the paths
- count = 0
- filenames = []
- for test_images in _generate_test_images():
- for im in test_images:
- # rotate image class
- im_class = count % num_classes
- # rotate subfolders
- classpaths = paths[im_class]
- filename = os.path.join(classpaths[count % len(classpaths)],
- 'image-{}.jpg'.format(count))
- filenames.append(filename)
- im.save(os.path.join(temp_dir, filename))
- count += 1
-
- # Test image loading util
- fname = os.path.join(temp_dir, filenames[0])
- _ = preprocessing_image.load_img(fname)
- _ = preprocessing_image.load_img(fname, grayscale=True)
- _ = preprocessing_image.load_img(fname, target_size=(10, 10))
- _ = preprocessing_image.load_img(fname, target_size=(10, 10),
- interpolation='bilinear')
-
- # create iterator
- generator = preprocessing_image.ImageDataGenerator()
- dir_iterator = generator.flow_from_directory(temp_dir)
-
- # check number of classes and images
- self.assertEqual(len(dir_iterator.class_indices), num_classes)
- self.assertEqual(len(dir_iterator.classes), count)
- self.assertEqual(set(dir_iterator.filenames), set(filenames))
-
- def preprocessing_function(x):
- """This will fail if not provided by a Numpy array.
-
- Note: This is made to enforce backward compatibility.
-
- Args:
- x: A numpy array.
-
- Returns:
- An array of zeros with the same shape as the given array.
- """
- self.assertEqual(x.shape, (26, 26, 3))
- self.assertIs(type(x), np.ndarray)
- return np.zeros_like(x)
-
- # Test usage as Sequence
- generator = preprocessing_image.ImageDataGenerator(
- preprocessing_function=preprocessing_function)
- dir_seq = generator.flow_from_directory(
- str(temp_dir),
- target_size=(26, 26),
- color_mode='rgb',
- batch_size=3,
- class_mode='categorical')
- self.assertEqual(len(dir_seq), count // 3 + 1)
- x1, y1 = dir_seq[1]
- self.assertEqual(x1.shape, (3, 26, 26, 3))
- self.assertEqual(y1.shape, (3, num_classes))
- x1, y1 = dir_seq[5]
- self.assertTrue((x1 == 0).all())
-
- def directory_iterator_with_validation_split_test_helper(
- self, validation_split):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- num_classes = 2
- tmp_folder = tempfile.mkdtemp(prefix='test_images')
-
- # create folders and subfolders
- paths = []
- for cl in range(num_classes):
- class_directory = 'class-{}'.format(cl)
- classpaths = [
- class_directory,
- os.path.join(class_directory, 'subfolder-1'),
- os.path.join(class_directory, 'subfolder-2'),
- os.path.join(class_directory, 'subfolder-1', 'sub-subfolder')
- ]
- for path in classpaths:
- os.mkdir(os.path.join(tmp_folder, path))
- paths.append(classpaths)
-
- # save the images in the paths
- count = 0
- filenames = []
- for test_images in _generate_test_images():
- for im in test_images:
- # rotate image class
- im_class = count % num_classes
- # rotate subfolders
- classpaths = paths[im_class]
- filename = os.path.join(classpaths[count % len(classpaths)],
- 'image-{}.jpg'.format(count))
- filenames.append(filename)
- im.save(os.path.join(tmp_folder, filename))
- count += 1
-
- # create iterator
- generator = preprocessing_image.ImageDataGenerator(
- validation_split=validation_split)
-
- with self.assertRaises(ValueError):
- generator.flow_from_directory(tmp_folder, subset='foo')
-
- num_validation = int(count * validation_split)
- num_training = count - num_validation
- train_iterator = generator.flow_from_directory(
- tmp_folder, subset='training')
- self.assertEqual(train_iterator.samples, num_training)
-
- valid_iterator = generator.flow_from_directory(
- tmp_folder, subset='validation')
- self.assertEqual(valid_iterator.samples, num_validation)
-
- # check number of classes and images
- self.assertEqual(len(train_iterator.class_indices), num_classes)
- self.assertEqual(len(train_iterator.classes), num_training)
- self.assertEqual(
- len(set(train_iterator.filenames) & set(filenames)), num_training)
-
- model = sequential.Sequential([layers.Flatten(), layers.Dense(2)])
- model.compile(optimizer='sgd', loss='mse')
- model.fit(train_iterator, epochs=1)
-
- shutil.rmtree(tmp_folder)
-
- @keras_parameterized.run_all_keras_modes
- def test_directory_iterator_with_validation_split_25_percent(self):
- self.directory_iterator_with_validation_split_test_helper(0.25)
-
- @keras_parameterized.run_all_keras_modes
- def test_directory_iterator_with_validation_split_40_percent(self):
- self.directory_iterator_with_validation_split_test_helper(0.40)
-
- @keras_parameterized.run_all_keras_modes
- def test_directory_iterator_with_validation_split_50_percent(self):
- self.directory_iterator_with_validation_split_test_helper(0.50)
-
- def test_img_utils(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- height, width = 10, 8
-
- # Test channels_first data format
- x = np.random.random((3, height, width))
- img = preprocessing_image.array_to_img(
- x, data_format='channels_first')
- self.assertEqual(img.size, (width, height))
- x = preprocessing_image.img_to_array(
- img, data_format='channels_first')
- self.assertEqual(x.shape, (3, height, width))
- # Test 2D
- x = np.random.random((1, height, width))
- img = preprocessing_image.array_to_img(
- x, data_format='channels_first')
- self.assertEqual(img.size, (width, height))
- x = preprocessing_image.img_to_array(
- img, data_format='channels_first')
- self.assertEqual(x.shape, (1, height, width))
-
- # Test channels_last data format
- x = np.random.random((height, width, 3))
- img = preprocessing_image.array_to_img(x, data_format='channels_last')
- self.assertEqual(img.size, (width, height))
- x = preprocessing_image.img_to_array(img, data_format='channels_last')
- self.assertEqual(x.shape, (height, width, 3))
- # Test 2D
- x = np.random.random((height, width, 1))
- img = preprocessing_image.array_to_img(x, data_format='channels_last')
- self.assertEqual(img.size, (width, height))
- x = preprocessing_image.img_to_array(img, data_format='channels_last')
- self.assertEqual(x.shape, (height, width, 1))
-
- def test_batch_standardize(self):
- if PIL is None:
- return # Skip test if PIL is not available.
-
- # ImageDataGenerator.standardize should work on batches
- for test_images in _generate_test_images():
- img_list = []
- for im in test_images:
- img_list.append(preprocessing_image.img_to_array(im)[None, ...])
-
- images = np.vstack(img_list)
- generator = preprocessing_image.ImageDataGenerator(
- featurewise_center=True,
- samplewise_center=True,
- featurewise_std_normalization=True,
- samplewise_std_normalization=True,
- zca_whitening=True,
- rotation_range=90.,
- width_shift_range=0.1,
- height_shift_range=0.1,
- shear_range=0.5,
- zoom_range=0.2,
- channel_shift_range=0.,
- brightness_range=(1, 5),
- fill_mode='nearest',
- cval=0.5,
- horizontal_flip=True,
- vertical_flip=True)
- generator.fit(images, augment=True)
-
- transformed = np.copy(images)
- for i, im in enumerate(transformed):
- transformed[i] = generator.random_transform(im)
- transformed = generator.standardize(transformed)
-
- def test_img_transforms(self):
- x = np.random.random((3, 200, 200))
- _ = preprocessing_image.random_rotation(x, 20)
- _ = preprocessing_image.random_shift(x, 0.2, 0.2)
- _ = preprocessing_image.random_shear(x, 2.)
- _ = preprocessing_image.random_zoom(x, (0.5, 0.5))
- _ = preprocessing_image.apply_channel_shift(x, 2, 2)
- _ = preprocessing_image.apply_affine_transform(x, 2)
- with self.assertRaises(ValueError):
- preprocessing_image.random_zoom(x, (0, 0, 0))
- _ = preprocessing_image.random_channel_shift(x, 2.)
-
-
-if __name__ == '__main__':
- v2_compat.enable_v2_behavior()
- test.main()
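The `smart_resize` helper exercised by the deleted tests above is not removed by this cleanup; it remains importable from the keras pip package. A minimal sketch of its behavior (crop to the target aspect ratio, then resize) follows, assuming the `tf.keras.preprocessing.image.smart_resize` export.

```python
# Minimal sketch of smart_resize; assumes the tf.keras.preprocessing.image
# export backed by the keras pip package.
import numpy as np
import tensorflow as tf

img = np.random.random((20, 40, 3))          # H x W x C input
out = tf.keras.preprocessing.image.smart_resize(img, size=(50, 50))
print(out.shape)                             # (50, 50, 3): central crop to the
                                             # target aspect ratio, then resize

batch = np.random.random((2, 20, 40, 3))     # batched input is also accepted
out = tf.keras.preprocessing.image.smart_resize(batch, size=(20, 20))
print(out.shape)                             # (2, 20, 20, 3)
```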
diff --git a/tensorflow/python/keras/preprocessing/sequence.py b/tensorflow/python/keras/preprocessing/sequence.py
deleted file mode 100644
index 51a4110..0000000
--- a/tensorflow/python/keras/preprocessing/sequence.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for preprocessing sequence data."""
-# pylint: disable=invalid-name
-
-from keras_preprocessing import sequence
-
-from tensorflow.python.keras.utils import data_utils
-from tensorflow.python.util.tf_export import keras_export
-
-make_sampling_table = sequence.make_sampling_table
-skipgrams = sequence.skipgrams
-# TODO(fchollet): consider making `_remove_long_seq` public.
-_remove_long_seq = sequence._remove_long_seq # pylint: disable=protected-access
-
-
-@keras_export('keras.preprocessing.sequence.TimeseriesGenerator')
-class TimeseriesGenerator(sequence.TimeseriesGenerator, data_utils.Sequence):
- """Utility class for generating batches of temporal data.
-
- This class takes in a sequence of data-points gathered at
- equal intervals, along with time series parameters such as
- stride, length of history, etc., to produce batches for
- training/validation.
- # Arguments
- data: Indexable generator (such as list or Numpy array)
- containing consecutive data points (timesteps).
-          The data should be at least 2D, and axis 0 is expected
- to be the time dimension.
- targets: Targets corresponding to timesteps in `data`.
- It should have same length as `data`.
- length: Length of the output sequences (in number of timesteps).
- sampling_rate: Period between successive individual timesteps
- within sequences. For rate `r`, timesteps
- `data[i]`, `data[i-r]`, ... `data[i - length]`
-          are used to create a sample sequence.
- stride: Period between successive output sequences.
- For stride `s`, consecutive output samples would
- be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
- start_index: Data points earlier than `start_index` will not be used
- in the output sequences. This is useful to reserve part of the
- data for test or validation.
- end_index: Data points later than `end_index` will not be used
- in the output sequences. This is useful to reserve part of the
- data for test or validation.
- shuffle: Whether to shuffle output samples,
- or instead draw them in chronological order.
-      reverse: Boolean: if `True`, timesteps in each output sample will be
- in reverse chronological order.
- batch_size: Number of timeseries samples in each batch
- (except maybe the last one).
- # Returns
- A [Sequence](https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) instance.
- # Examples
- ```python
- from keras.preprocessing.sequence import TimeseriesGenerator
- import numpy as np
- data = np.array([[i] for i in range(50)])
- targets = np.array([[i] for i in range(50)])
- data_gen = TimeseriesGenerator(data, targets,
- length=10, sampling_rate=2,
- batch_size=2)
- assert len(data_gen) == 20
- batch_0 = data_gen[0]
- x, y = batch_0
- assert np.array_equal(x,
- np.array([[[0], [2], [4], [6], [8]],
- [[1], [3], [5], [7], [9]]]))
- assert np.array_equal(y,
- np.array([[10], [11]]))
- ```
- """
- pass
-
-
-@keras_export('keras.preprocessing.sequence.pad_sequences')
-def pad_sequences(sequences, maxlen=None, dtype='int32',
- padding='pre', truncating='pre', value=0.):
- """Pads sequences to the same length.
-
- This function transforms a list (of length `num_samples`)
- of sequences (lists of integers)
- into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
- `num_timesteps` is either the `maxlen` argument if provided,
- or the length of the longest sequence in the list.
-
- Sequences that are shorter than `num_timesteps`
- are padded with `value` until they are `num_timesteps` long.
-
- Sequences longer than `num_timesteps` are truncated
- so that they fit the desired length.
-
- The position where padding or truncation happens is determined by
- the arguments `padding` and `truncating`, respectively.
- Pre-padding or removing values from the beginning of the sequence is the
- default.
-
- >>> sequence = [[1], [2, 3], [4, 5, 6]]
- >>> tf.keras.preprocessing.sequence.pad_sequences(sequence)
- array([[0, 0, 1],
- [0, 2, 3],
- [4, 5, 6]], dtype=int32)
-
- >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1)
- array([[-1, -1, 1],
- [-1, 2, 3],
- [ 4, 5, 6]], dtype=int32)
-
- >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post')
- array([[1, 0, 0],
- [2, 3, 0],
- [4, 5, 6]], dtype=int32)
-
- >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2)
- array([[0, 1],
- [2, 3],
- [5, 6]], dtype=int32)
-
- Args:
- sequences: List of sequences (each sequence is a list of integers).
- maxlen: Optional Int, maximum length of all sequences. If not provided,
- sequences will be padded to the length of the longest individual
- sequence.
- dtype: (Optional, defaults to int32). Type of the output sequences.
- To pad sequences with variable length strings, you can use `object`.
- padding: String, 'pre' or 'post' (optional, defaults to 'pre'):
- pad either before or after each sequence.
- truncating: String, 'pre' or 'post' (optional, defaults to 'pre'):
- remove values from sequences larger than
- `maxlen`, either at the beginning or at the end of the sequences.
- value: Float or String, padding value. (Optional, defaults to 0.)
-
- Returns:
- Numpy array with shape `(len(sequences), maxlen)`
-
- Raises:
- ValueError: In case of invalid values for `truncating` or `padding`,
- or in case of invalid shape for a `sequences` entry.
- """
- return sequence.pad_sequences(
- sequences, maxlen=maxlen, dtype=dtype,
- padding=padding, truncating=truncating, value=value)
-
-keras_export(
- 'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table)
-keras_export('keras.preprocessing.sequence.skipgrams')(skipgrams)
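`make_sampling_table` and `skipgrams` are re-exported above without usage examples. A minimal sketch of how they are typically combined, assuming the public `tf.keras.preprocessing.sequence` aliases; the toy sequence and vocabulary size are illustrative:

```python
from tensorflow.keras.preprocessing.sequence import make_sampling_table, skipgrams

vocabulary_size = 10
sentence = [1, 2, 3, 4, 5]  # toy sequence of word indices; 0 is reserved for padding

# (target, context) pairs drawn from a +/-2 word window; negative pairs get label 0.
pairs, labels = skipgrams(
    sentence, vocabulary_size, window_size=2, negative_samples=1.0)
for (target, context), label in zip(pairs, labels):
    print(target, context, label)

# Zipf-based keep probabilities, usable as the `sampling_table` argument of
# skipgrams() to sub-sample very frequent words.
print(make_sampling_table(vocabulary_size))
```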
diff --git a/tensorflow/python/keras/preprocessing/sequence_test.py b/tensorflow/python/keras/preprocessing/sequence_test.py
deleted file mode 100644
index eeb84b6..0000000
--- a/tensorflow/python/keras/preprocessing/sequence_test.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for sequence data preprocessing utils."""
-
-from math import ceil
-
-import numpy as np
-
-from tensorflow.python.keras.preprocessing import sequence as preprocessing_sequence
-from tensorflow.python.platform import test
-
-
-class TestSequence(test.TestCase):
-
- def test_pad_sequences(self):
- a = [[1], [1, 2], [1, 2, 3]]
-
- # test padding
- b = preprocessing_sequence.pad_sequences(a, maxlen=3, padding='pre')
- self.assertAllClose(b, [[0, 0, 1], [0, 1, 2], [1, 2, 3]])
- b = preprocessing_sequence.pad_sequences(a, maxlen=3, padding='post')
- self.assertAllClose(b, [[1, 0, 0], [1, 2, 0], [1, 2, 3]])
-
- # test truncating
- b = preprocessing_sequence.pad_sequences(
- a, maxlen=2, truncating='pre')
- self.assertAllClose(b, [[0, 1], [1, 2], [2, 3]])
- b = preprocessing_sequence.pad_sequences(
- a, maxlen=2, truncating='post')
- self.assertAllClose(b, [[0, 1], [1, 2], [1, 2]])
-
- # test value
- b = preprocessing_sequence.pad_sequences(a, maxlen=3, value=1)
- self.assertAllClose(b, [[1, 1, 1], [1, 1, 2], [1, 2, 3]])
-
- def test_pad_sequences_vector(self):
- a = [[[1, 1]], [[2, 1], [2, 2]], [[3, 1], [3, 2], [3, 3]]]
-
- # test padding
- b = preprocessing_sequence.pad_sequences(a, maxlen=3, padding='pre')
- self.assertAllClose(b, [[[0, 0], [0, 0], [1, 1]], [[0, 0], [2, 1], [2, 2]],
- [[3, 1], [3, 2], [3, 3]]])
- b = preprocessing_sequence.pad_sequences(a, maxlen=3, padding='post')
- self.assertAllClose(b, [[[1, 1], [0, 0], [0, 0]], [[2, 1], [2, 2], [0, 0]],
- [[3, 1], [3, 2], [3, 3]]])
-
- # test truncating
- b = preprocessing_sequence.pad_sequences(
- a, maxlen=2, truncating='pre')
- self.assertAllClose(b, [[[0, 0], [1, 1]], [[2, 1], [2, 2]], [[3, 2], [3,
- 3]]])
-
- b = preprocessing_sequence.pad_sequences(
- a, maxlen=2, truncating='post')
- self.assertAllClose(b, [[[0, 0], [1, 1]], [[2, 1], [2, 2]], [[3, 1], [3,
- 2]]])
-
- # test value
- b = preprocessing_sequence.pad_sequences(a, maxlen=3, value=1)
- self.assertAllClose(b, [[[1, 1], [1, 1], [1, 1]], [[1, 1], [2, 1], [2, 2]],
- [[3, 1], [3, 2], [3, 3]]])
-
- def test_make_sampling_table(self):
- a = preprocessing_sequence.make_sampling_table(3)
- self.assertAllClose(
- a, np.asarray([0.00315225, 0.00315225, 0.00547597]), rtol=.1)
-
- def test_skipgrams(self):
- # test with no window size and binary labels
- couples, labels = preprocessing_sequence.skipgrams(
- np.arange(3), vocabulary_size=3)
- for couple in couples:
- self.assertIn(couple[0], [0, 1, 2])
- self.assertIn(couple[1], [0, 1, 2])
-
- # test window size and categorical labels
- couples, labels = preprocessing_sequence.skipgrams(
- np.arange(5), vocabulary_size=5, window_size=1, categorical=True)
- for couple in couples:
- self.assertLessEqual(couple[0] - couple[1], 3)
- for l in labels:
- self.assertEqual(len(l), 2)
-
- def test_remove_long_seq(self):
- a = [[[1, 1]], [[2, 1], [2, 2]], [[3, 1], [3, 2], [3, 3]]]
-
- new_seq, new_label = preprocessing_sequence._remove_long_seq(
- maxlen=3, seq=a, label=['a', 'b', ['c', 'd']])
- self.assertEqual(new_seq, [[[1, 1]], [[2, 1], [2, 2]]])
- self.assertEqual(new_label, ['a', 'b'])
-
- def test_TimeseriesGenerator(self):
- data = np.array([[i] for i in range(50)])
- targets = np.array([[i] for i in range(50)])
-
- data_gen = preprocessing_sequence.TimeseriesGenerator(
- data, targets, length=10, sampling_rate=2, batch_size=2)
- self.assertEqual(len(data_gen), 20)
- self.assertAllClose(data_gen[0][0],
- np.array([[[0], [2], [4], [6], [8]], [[1], [3], [5],
- [7], [9]]]))
- self.assertAllClose(data_gen[0][1], np.array([[10], [11]]))
- self.assertAllClose(data_gen[1][0],
- np.array([[[2], [4], [6], [8], [10]], [[3], [5], [7],
- [9], [11]]]))
- self.assertAllClose(data_gen[1][1], np.array([[12], [13]]))
-
- data_gen = preprocessing_sequence.TimeseriesGenerator(
- data, targets, length=10, sampling_rate=2, reverse=True, batch_size=2)
- self.assertEqual(len(data_gen), 20)
- self.assertAllClose(data_gen[0][0],
- np.array([[[8], [6], [4], [2], [0]], [[9], [7], [5],
- [3], [1]]]))
- self.assertAllClose(data_gen[0][1], np.array([[10], [11]]))
-
- data_gen = preprocessing_sequence.TimeseriesGenerator(
- data, targets, length=10, sampling_rate=2, shuffle=True, batch_size=1)
- batch = data_gen[0]
- r = batch[1][0][0]
- self.assertAllClose(batch[0],
- np.array([[[r - 10], [r - 8], [r - 6], [r - 4],
- [r - 2]]]))
- self.assertAllClose(batch[1], np.array([
- [r],
- ]))
-
- data_gen = preprocessing_sequence.TimeseriesGenerator(
- data, targets, length=10, sampling_rate=2, stride=2, batch_size=2)
- self.assertEqual(len(data_gen), 10)
- self.assertAllClose(data_gen[1][0],
- np.array([[[4], [6], [8], [10], [12]], [[6], [8], [10],
- [12], [14]]]))
- self.assertAllClose(data_gen[1][1], np.array([[14], [16]]))
-
- data_gen = preprocessing_sequence.TimeseriesGenerator(
- data,
- targets,
- length=10,
- sampling_rate=2,
- start_index=10,
- end_index=30,
- batch_size=2)
- self.assertEqual(len(data_gen), 6)
- self.assertAllClose(data_gen[0][0],
- np.array([[[10], [12], [14], [16], [18]],
- [[11], [13], [15], [17], [19]]]))
- self.assertAllClose(data_gen[0][1], np.array([[20], [21]]))
-
- data = np.array([np.random.random_sample((1, 2, 3, 4)) for i in range(50)])
- targets = np.array([np.random.random_sample((3, 2, 1)) for i in range(50)])
- data_gen = preprocessing_sequence.TimeseriesGenerator(
- data,
- targets,
- length=10,
- sampling_rate=2,
- start_index=10,
- end_index=30,
- batch_size=2)
-
- self.assertEqual(len(data_gen), 6)
- self.assertAllClose(data_gen[0][0],
- np.array(
- [np.array(data[10:19:2]),
- np.array(data[11:20:2])]))
- self.assertAllClose(data_gen[0][1], np.array([targets[20], targets[21]]))
-
- with self.assertRaises(ValueError) as context:
- preprocessing_sequence.TimeseriesGenerator(data, targets, length=50)
- error = str(context.exception)
- self.assertIn('`start_index+length=50 > end_index=49` is disallowed', error)
-
- def test_TimeSeriesGenerator_doesnt_miss_any_sample(self):
- x = np.array([[i] for i in range(10)])
-
- for length in range(3, 10):
- g = preprocessing_sequence.TimeseriesGenerator(
- x, x, length=length, batch_size=1)
- expected = max(0, len(x) - length)
- actual = len(g)
- self.assertEqual(expected, actual)
-
- if actual > 0:
- # All elements in range(length, 10) should be used as current step
- expected = np.arange(length, 10).reshape(-1, 1)
-
- y = np.concatenate([g[ix][1] for ix in range(len(g))], axis=0)
- self.assertAllClose(y, expected)
-
- x = np.array([[i] for i in range(23)])
-
- strides = (1, 1, 5, 7, 3, 5, 3)
- lengths = (3, 3, 4, 3, 1, 3, 7)
- batch_sizes = (6, 6, 6, 5, 6, 6, 6)
- shuffles = (False, True, True, False, False, False, False)
-
- for stride, length, batch_size, shuffle in zip(strides, lengths,
- batch_sizes, shuffles):
- g = preprocessing_sequence.TimeseriesGenerator(
- x,
- x,
- length=length,
- sampling_rate=1,
- stride=stride,
- start_index=0,
- end_index=None,
- shuffle=shuffle,
- reverse=False,
- batch_size=batch_size)
- if shuffle:
- # all batches have the same size when shuffle is True.
- expected_sequences = ceil(
- (23 - length) / float(batch_size * stride)) * batch_size
- else:
- # last batch will be different if `(samples - length) / stride`
- # is not a multiple of `batch_size`.
- expected_sequences = ceil((23 - length) / float(stride))
-
- expected_batches = ceil(expected_sequences / float(batch_size))
-
- y = [g[ix][1] for ix in range(len(g))]
-
- actual_sequences = sum(len(iy) for iy in y)
- actual_batches = len(y)
-
- self.assertEqual(expected_sequences, actual_sequences)
- self.assertEqual(expected_batches, actual_batches)
-
-
-if __name__ == '__main__':
- test.main()
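The batch accounting asserted in `test_TimeSeriesGenerator_doesnt_miss_any_sample` above reduces to two ceiling divisions. A worked example using one of the non-shuffled parameter tuples from that test:

```python
from math import ceil

samples, length, stride, batch_size = 23, 3, 5, 6

# One usable window per `stride` step over the trailing (samples - length) positions.
sequences = ceil((samples - length) / float(stride))  # ceil(20 / 5) = 4
batches = ceil(sequences / float(batch_size))         # ceil(4 / 6)  = 1
print(sequences, batches)
```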
diff --git a/tensorflow/python/keras/preprocessing/text.py b/tensorflow/python/keras/preprocessing/text.py
deleted file mode 100644
index d36a9a5..0000000
--- a/tensorflow/python/keras/preprocessing/text.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for text input preprocessing."""
-# pylint: disable=invalid-name
-
-from keras_preprocessing import text
-
-from tensorflow.python.keras.preprocessing.text_dataset import text_dataset_from_directory # pylint: disable=unused-import
-from tensorflow.python.util.tf_export import keras_export
-
-hashing_trick = text.hashing_trick
-Tokenizer = text.Tokenizer
-
-
-@keras_export('keras.preprocessing.text.text_to_word_sequence')
-def text_to_word_sequence(input_text,
- filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
- lower=True,
- split=' '):
- """Converts a text to a sequence of words (or tokens).
-
- This function transforms a string of text into a list of words
- while ignoring `filters` which include punctuations by default.
-
- >>> sample_text = 'This is a sample sentence.'
- >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
- ['this', 'is', 'a', 'sample', 'sentence']
-
- Args:
- input_text: Input text (string).
- filters: list (or concatenation) of characters to filter out, such as
- punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
- includes basic punctuation, tabs, and newlines.
- lower: boolean. Whether to convert the input to lowercase.
- split: str. Separator for word splitting.
-
- Returns:
- A list of words (or tokens).
- """
- return text.text_to_word_sequence(
- input_text, filters=filters, lower=lower, split=split)
-
-
-@keras_export('keras.preprocessing.text.one_hot')
-def one_hot(input_text,
- n,
- filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
- lower=True,
- split=' '):
- r"""One-hot encodes a text into a list of word indexes of size `n`.
-
- This function receives as input a string of text and returns a
- list of encoded integers each corresponding to a word (or token)
- in the given input string.
-
- Args:
- input_text: Input text (string).
- n: int. Size of vocabulary.
- filters: list (or concatenation) of characters to filter out, such as
- punctuation. Default:
- ```
-        '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n'
- ```,
- includes basic punctuation, tabs, and newlines.
- lower: boolean. Whether to set the text to lowercase.
- split: str. Separator for word splitting.
-
- Returns:
- List of integers in `[1, n]`. Each integer encodes a word
-    (uniqueness of the encoding is not guaranteed).
- """
- return text.one_hot(input_text, n, filters=filters, lower=lower, split=split)
-
-
-# text.tokenizer_from_json is only available if keras_preprocessing >= 1.1.0
-try:
- tokenizer_from_json = text.tokenizer_from_json
- keras_export('keras.preprocessing.text.tokenizer_from_json')(
- tokenizer_from_json)
-except AttributeError:
- pass
-
-keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
-keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
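The removed module is a thin wrapper around `keras_preprocessing.text`. A minimal sketch of the wrapped API, assuming the public `tf.keras.preprocessing.text` aliases; the sample texts are illustrative:

```python
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot, text_to_word_sequence

texts = ['The cat sat on the mat.', 'The dog sat on the log.']

# Lowercased tokenization with the default punctuation filters applied.
print(text_to_word_sequence(texts[0]))  # ['the', 'cat', 'sat', 'on', 'the', 'mat']

# Hash-based word indices in [1, n]; collisions are possible.
print(one_hot(texts[0], 10))

# Stateful tokenizer: fit once, then emit index sequences or document matrices.
tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)
print(tokenizer.texts_to_sequences(texts))
print(tokenizer.texts_to_matrix(texts, mode='count').shape)  # (2, 10)
```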
diff --git a/tensorflow/python/keras/preprocessing/text_dataset.py b/tensorflow/python/keras/preprocessing/text_dataset.py
deleted file mode 100644
index f29a7f6..0000000
--- a/tensorflow/python/keras/preprocessing/text_dataset.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras text dataset generation utilities."""
-
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.keras.preprocessing import dataset_utils
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.util.tf_export import keras_export
-
-
-@keras_export('keras.utils.text_dataset_from_directory',
- 'keras.preprocessing.text_dataset_from_directory',
- v1=[])
-def text_dataset_from_directory(directory,
- labels='inferred',
- label_mode='int',
- class_names=None,
- batch_size=32,
- max_length=None,
- shuffle=True,
- seed=None,
- validation_split=None,
- subset=None,
- follow_links=False):
- """Generates a `tf.data.Dataset` from text files in a directory.
-
- If your directory structure is:
-
- ```
- main_directory/
- ...class_a/
- ......a_text_1.txt
- ......a_text_2.txt
- ...class_b/
- ......b_text_1.txt
- ......b_text_2.txt
- ```
-
- Then calling `text_dataset_from_directory(main_directory, labels='inferred')`
- will return a `tf.data.Dataset` that yields batches of texts from
- the subdirectories `class_a` and `class_b`, together with labels
- 0 and 1 (0 corresponding to `class_a` and 1 corresponding to `class_b`).
-
- Only `.txt` files are supported at this time.
-
- Args:
- directory: Directory where the data is located.
- If `labels` is "inferred", it should contain
- subdirectories, each containing text files for a class.
- Otherwise, the directory structure is ignored.
- labels: Either "inferred"
- (labels are generated from the directory structure),
- None (no labels),
- or a list/tuple of integer labels of the same size as the number of
- text files found in the directory. Labels should be sorted according
- to the alphanumeric order of the text file paths
- (obtained via `os.walk(directory)` in Python).
- label_mode:
- - 'int': means that the labels are encoded as integers
- (e.g. for `sparse_categorical_crossentropy` loss).
- - 'categorical' means that the labels are
- encoded as a categorical vector
- (e.g. for `categorical_crossentropy` loss).
- - 'binary' means that the labels (there can be only 2)
- are encoded as `float32` scalars with values 0 or 1
- (e.g. for `binary_crossentropy`).
- - None (no labels).
-    class_names: Only valid if "labels" is "inferred". This is the explicit
- list of class names (must match names of subdirectories). Used
- to control the order of the classes
- (otherwise alphanumerical order is used).
- batch_size: Size of the batches of data. Default: 32.
- max_length: Maximum size of a text string. Texts longer than this will
- be truncated to `max_length`.
- shuffle: Whether to shuffle the data. Default: True.
- If set to False, sorts the data in alphanumeric order.
- seed: Optional random seed for shuffling and transformations.
- validation_split: Optional float between 0 and 1,
- fraction of data to reserve for validation.
- subset: One of "training" or "validation".
- Only used if `validation_split` is set.
-    follow_links: Whether to visit subdirectories pointed to by symlinks.
- Defaults to False.
-
- Returns:
- A `tf.data.Dataset` object.
- - If `label_mode` is None, it yields `string` tensors of shape
- `(batch_size,)`, containing the contents of a batch of text files.
- - Otherwise, it yields a tuple `(texts, labels)`, where `texts`
- has shape `(batch_size,)` and `labels` follows the format described
- below.
-
- Rules regarding labels format:
- - if `label_mode` is `int`, the labels are an `int32` tensor of shape
- `(batch_size,)`.
- - if `label_mode` is `binary`, the labels are a `float32` tensor of
- 1s and 0s of shape `(batch_size, 1)`.
-    - if `label_mode` is `categorical`, the labels are a `float32` tensor
- of shape `(batch_size, num_classes)`, representing a one-hot
- encoding of the class index.
- """
- if labels not in ('inferred', None):
- if not isinstance(labels, (list, tuple)):
- raise ValueError(
- '`labels` argument should be a list/tuple of integer labels, of '
- 'the same size as the number of text files in the target '
- 'directory. If you wish to infer the labels from the subdirectory '
- 'names in the target directory, pass `labels="inferred"`. '
- 'If you wish to get a dataset that only contains text samples '
- '(no labels), pass `labels=None`.')
- if class_names:
- raise ValueError('You can only pass `class_names` if the labels are '
- 'inferred from the subdirectory names in the target '
- 'directory (`labels="inferred"`).')
- if label_mode not in {'int', 'categorical', 'binary', None}:
- raise ValueError(
- '`label_mode` argument must be one of "int", "categorical", "binary", '
- 'or None. Received: %s' % (label_mode,))
- if labels is None or label_mode is None:
- labels = None
- label_mode = None
- dataset_utils.check_validation_split_arg(
- validation_split, subset, shuffle, seed)
-
- if seed is None:
- seed = np.random.randint(1e6)
- file_paths, labels, class_names = dataset_utils.index_directory(
- directory,
- labels,
- formats=('.txt',),
- class_names=class_names,
- shuffle=shuffle,
- seed=seed,
- follow_links=follow_links)
-
- if label_mode == 'binary' and len(class_names) != 2:
- raise ValueError(
-        'When passing `label_mode="binary"`, there must be exactly 2 classes. '
- 'Found the following classes: %s' % (class_names,))
-
- file_paths, labels = dataset_utils.get_training_or_validation_split(
- file_paths, labels, validation_split, subset)
- if not file_paths:
- raise ValueError('No text files found.')
-
- dataset = paths_and_labels_to_dataset(
- file_paths=file_paths,
- labels=labels,
- label_mode=label_mode,
- num_classes=len(class_names),
- max_length=max_length)
- if shuffle:
- # Shuffle locally at each iteration
- dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed)
- dataset = dataset.batch(batch_size)
- # Users may need to reference `class_names`.
- dataset.class_names = class_names
- return dataset
-
-
-def paths_and_labels_to_dataset(file_paths,
- labels,
- label_mode,
- num_classes,
- max_length):
- """Constructs a dataset of text strings and labels."""
- path_ds = dataset_ops.Dataset.from_tensor_slices(file_paths)
- string_ds = path_ds.map(
- lambda x: path_to_string_content(x, max_length))
- if label_mode:
- label_ds = dataset_utils.labels_to_dataset(labels, label_mode, num_classes)
- string_ds = dataset_ops.Dataset.zip((string_ds, label_ds))
- return string_ds
-
-
-def path_to_string_content(path, max_length):
- txt = io_ops.read_file(path)
- if max_length is not None:
- txt = string_ops.substr(txt, 0, max_length)
- return txt
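A minimal usage sketch of the directory loader defined above, assuming the layout from its docstring (`main_directory/class_a/*.txt`, `main_directory/class_b/*.txt`; the path is a placeholder) and the public `tf.keras.preprocessing` alias:

```python
import tensorflow as tf

train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'main_directory', batch_size=32, validation_split=0.2,
    subset='training', seed=1337)
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'main_directory', batch_size=32, validation_split=0.2,
    subset='validation', seed=1337)

print(train_ds.class_names)  # inferred from the subdirectory names
for texts, labels in train_ds.take(1):
    # Default label_mode='int': string texts of shape (batch,), int32 labels.
    print(texts.shape, labels.shape, labels.dtype)
```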
diff --git a/tensorflow/python/keras/preprocessing/text_dataset_test.py b/tensorflow/python/keras/preprocessing/text_dataset_test.py
deleted file mode 100644
index a40364d..0000000
--- a/tensorflow/python/keras/preprocessing/text_dataset_test.py
+++ /dev/null
@@ -1,255 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for text_dataset."""
-
-import os
-import random
-import shutil
-import string
-
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.keras import keras_parameterized
-from tensorflow.python.keras.preprocessing import text_dataset
-from tensorflow.python.platform import test
-
-
-class TextDatasetFromDirectoryTest(keras_parameterized.TestCase):
-
- def _prepare_directory(self,
- num_classes=2,
- nested_dirs=False,
- count=16,
- length=20):
- # Get a unique temp directory
- temp_dir = os.path.join(self.get_temp_dir(), str(random.randint(0, 1e6)))
- os.mkdir(temp_dir)
- self.addCleanup(shutil.rmtree, temp_dir)
-
- # Generate paths to class subdirectories
- paths = []
- for class_index in range(num_classes):
- class_directory = 'class_%s' % (class_index,)
- if nested_dirs:
- class_paths = [
- class_directory, os.path.join(class_directory, 'subfolder_1'),
- os.path.join(class_directory, 'subfolder_2'), os.path.join(
- class_directory, 'subfolder_1', 'sub-subfolder')
- ]
- else:
- class_paths = [class_directory]
- for path in class_paths:
- os.mkdir(os.path.join(temp_dir, path))
- paths += class_paths
-
- for i in range(count):
- path = paths[i % len(paths)]
- filename = os.path.join(path, 'text_%s.txt' % (i,))
- f = open(os.path.join(temp_dir, filename), 'w')
- text = ''.join([random.choice(string.printable) for _ in range(length)])
- f.write(text)
- f.close()
- return temp_dir
-
- def test_text_dataset_from_directory_standalone(self):
- # Test retrieving txt files without labels from a directory and its subdirs.
- # Save a few extra files in the parent directory.
- directory = self._prepare_directory(count=7, num_classes=2)
- for i in range(3):
- filename = 'text_%s.txt' % (i,)
- f = open(os.path.join(directory, filename), 'w')
- text = ''.join([random.choice(string.printable) for _ in range(20)])
- f.write(text)
- f.close()
-
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=5, label_mode=None, max_length=10)
- batch = next(iter(dataset))
- # We just return the texts, no labels
- self.assertEqual(batch.shape, (5,))
- self.assertEqual(batch.dtype.name, 'string')
- # Count samples
- batch_count = 0
- sample_count = 0
- for batch in dataset:
- batch_count += 1
- sample_count += batch.shape[0]
- self.assertEqual(batch_count, 2)
- self.assertEqual(sample_count, 10)
-
- def test_text_dataset_from_directory_binary(self):
- directory = self._prepare_directory(num_classes=2)
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, label_mode='int', max_length=10)
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8,))
- self.assertEqual(batch[0].dtype.name, 'string')
- self.assertEqual(len(batch[0].numpy()[0]), 10) # Test max_length
- self.assertEqual(batch[1].shape, (8,))
- self.assertEqual(batch[1].dtype.name, 'int32')
-
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, label_mode='binary')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8,))
- self.assertEqual(batch[0].dtype.name, 'string')
- self.assertEqual(batch[1].shape, (8, 1))
- self.assertEqual(batch[1].dtype.name, 'float32')
-
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, label_mode='categorical')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8,))
- self.assertEqual(batch[0].dtype.name, 'string')
- self.assertEqual(batch[1].shape, (8, 2))
- self.assertEqual(batch[1].dtype.name, 'float32')
-
- def test_sample_count(self):
- directory = self._prepare_directory(num_classes=4, count=15)
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, label_mode=None)
- sample_count = 0
- for batch in dataset:
- sample_count += batch.shape[0]
- self.assertEqual(sample_count, 15)
-
- def test_text_dataset_from_directory_multiclass(self):
- directory = self._prepare_directory(num_classes=4, count=15)
-
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, label_mode=None)
- batch = next(iter(dataset))
- self.assertEqual(batch.shape, (8,))
-
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, label_mode=None)
- sample_count = 0
- iterator = iter(dataset)
- for batch in dataset:
- sample_count += next(iterator).shape[0]
- self.assertEqual(sample_count, 15)
-
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, label_mode='int')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8,))
- self.assertEqual(batch[0].dtype.name, 'string')
- self.assertEqual(batch[1].shape, (8,))
- self.assertEqual(batch[1].dtype.name, 'int32')
-
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, label_mode='categorical')
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8,))
- self.assertEqual(batch[0].dtype.name, 'string')
- self.assertEqual(batch[1].shape, (8, 4))
- self.assertEqual(batch[1].dtype.name, 'float32')
-
- def test_text_dataset_from_directory_validation_split(self):
- directory = self._prepare_directory(num_classes=2, count=10)
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=10, validation_split=0.2, subset='training',
- seed=1337)
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (8,))
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=10, validation_split=0.2, subset='validation',
- seed=1337)
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertEqual(batch[0].shape, (2,))
-
- def test_text_dataset_from_directory_manual_labels(self):
- directory = self._prepare_directory(num_classes=2, count=2)
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, labels=[0, 1], shuffle=False)
- batch = next(iter(dataset))
- self.assertLen(batch, 2)
- self.assertAllClose(batch[1], [0, 1])
-
- def test_text_dataset_from_directory_follow_links(self):
- directory = self._prepare_directory(num_classes=2, count=25,
- nested_dirs=True)
- dataset = text_dataset.text_dataset_from_directory(
- directory, batch_size=8, label_mode=None, follow_links=True)
- sample_count = 0
- for batch in dataset:
- sample_count += batch.shape[0]
- self.assertEqual(sample_count, 25)
-
- def test_text_dataset_from_directory_no_files(self):
- directory = self._prepare_directory(num_classes=2, count=0)
- with self.assertRaisesRegex(ValueError, 'No text files found.'):
- _ = text_dataset.text_dataset_from_directory(directory)
-
- def test_text_dataset_from_directory_errors(self):
- directory = self._prepare_directory(num_classes=3, count=5)
-
- with self.assertRaisesRegex(ValueError, '`labels` argument should be'):
- _ = text_dataset.text_dataset_from_directory(
- directory, labels='other')
-
- with self.assertRaisesRegex(ValueError, '`label_mode` argument must be'):
- _ = text_dataset.text_dataset_from_directory(
- directory, label_mode='other')
-
- with self.assertRaisesRegex(
- ValueError, 'only pass `class_names` if the labels are inferred'):
- _ = text_dataset.text_dataset_from_directory(
- directory, labels=[0, 0, 1, 1, 1],
- class_names=['class_0', 'class_1', 'class_2'])
-
- with self.assertRaisesRegex(
- ValueError,
- 'Expected the lengths of `labels` to match the number of files'):
- _ = text_dataset.text_dataset_from_directory(
- directory, labels=[0, 0, 1, 1])
-
- with self.assertRaisesRegex(
- ValueError, '`class_names` passed did not match'):
- _ = text_dataset.text_dataset_from_directory(
- directory, class_names=['class_0', 'class_2'])
-
-    with self.assertRaisesRegex(ValueError, 'there must be exactly 2 classes'):
- _ = text_dataset.text_dataset_from_directory(
- directory, label_mode='binary')
-
- with self.assertRaisesRegex(ValueError,
- '`validation_split` must be between 0 and 1'):
- _ = text_dataset.text_dataset_from_directory(
- directory, validation_split=2)
-
- with self.assertRaisesRegex(ValueError,
- '`subset` must be either "training" or'):
- _ = text_dataset.text_dataset_from_directory(
- directory, validation_split=0.2, subset='other')
-
- with self.assertRaisesRegex(ValueError, '`validation_split` must be set'):
- _ = text_dataset.text_dataset_from_directory(
- directory, validation_split=0, subset='training')
-
- with self.assertRaisesRegex(ValueError, 'must provide a `seed`'):
- _ = text_dataset.text_dataset_from_directory(
- directory, validation_split=0.2, subset='training')
-
-
-if __name__ == '__main__':
- v2_compat.enable_v2_behavior()
- test.main()
diff --git a/tensorflow/python/keras/preprocessing/text_test.py b/tensorflow/python/keras/preprocessing/text_test.py
deleted file mode 100644
index abe99d7..0000000
--- a/tensorflow/python/keras/preprocessing/text_test.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for text data preprocessing utils."""
-
-import numpy as np
-
-from tensorflow.python.keras.preprocessing import text as preprocessing_text
-from tensorflow.python.platform import test
-
-
-class TestText(test.TestCase):
-
- def test_one_hot(self):
- text = 'The cat sat on the mat.'
- encoded = preprocessing_text.one_hot(text, 5)
- self.assertEqual(len(encoded), 6)
- self.assertLessEqual(np.max(encoded), 4)
- self.assertGreaterEqual(np.min(encoded), 0)
-
- # Test on unicode.
- text = u'The cat sat on the mat.'
- encoded = preprocessing_text.one_hot(text, 5)
- self.assertEqual(len(encoded), 6)
- self.assertLessEqual(np.max(encoded), 4)
- self.assertGreaterEqual(np.min(encoded), 0)
-
- def test_tokenizer(self):
- texts = [
- 'The cat sat on the mat.',
- 'The dog sat on the log.',
- 'Dogs and cats living together.'
- ]
- tokenizer = preprocessing_text.Tokenizer(num_words=10)
- tokenizer.fit_on_texts(texts)
-
- sequences = []
- for seq in tokenizer.texts_to_sequences_generator(texts):
- sequences.append(seq)
- self.assertLess(np.max(np.max(sequences)), 10)
- self.assertEqual(np.min(np.min(sequences)), 1)
-
- tokenizer.fit_on_sequences(sequences)
-
- for mode in ['binary', 'count', 'tfidf', 'freq']:
- matrix = tokenizer.texts_to_matrix(texts, mode)
- self.assertEqual(matrix.shape, (3, 10))
-
- def test_hashing_trick_hash(self):
- text = 'The cat sat on the mat.'
- encoded = preprocessing_text.hashing_trick(text, 5)
- self.assertEqual(len(encoded), 6)
- self.assertLessEqual(np.max(encoded), 4)
- self.assertGreaterEqual(np.min(encoded), 1)
-
- def test_hashing_trick_md5(self):
- text = 'The cat sat on the mat.'
- encoded = preprocessing_text.hashing_trick(
- text, 5, hash_function='md5')
- self.assertEqual(len(encoded), 6)
- self.assertLessEqual(np.max(encoded), 4)
- self.assertGreaterEqual(np.min(encoded), 1)
-
- def test_tokenizer_oov_flag(self):
- x_train = ['This text has only known words']
- x_test = ['This text has some unknown words'] # 2 OOVs: some, unknown
-
- # Default, without OOV flag
- tokenizer = preprocessing_text.Tokenizer()
- tokenizer.fit_on_texts(x_train)
- x_test_seq = tokenizer.texts_to_sequences(x_test)
- self.assertEqual(len(x_test_seq[0]), 4) # discards 2 OOVs
-
- # With OOV feature
- tokenizer = preprocessing_text.Tokenizer(oov_token='<unk>')
- tokenizer.fit_on_texts(x_train)
- x_test_seq = tokenizer.texts_to_sequences(x_test)
- self.assertEqual(len(x_test_seq[0]), 6) # OOVs marked in place
-
- def test_sequential_fit(self):
- texts = [
- 'The cat sat on the mat.', 'The dog sat on the log.',
- 'Dogs and cats living together.'
- ]
- word_sequences = [['The', 'cat', 'is', 'sitting'],
- ['The', 'dog', 'is', 'standing']]
- tokenizer = preprocessing_text.Tokenizer()
- tokenizer.fit_on_texts(texts)
- tokenizer.fit_on_texts(word_sequences)
-
- self.assertEqual(tokenizer.document_count, 5)
-
- tokenizer.texts_to_matrix(texts)
- tokenizer.texts_to_matrix(word_sequences)
-
- def test_text_to_word_sequence(self):
- text = 'hello! ? world!'
- seq = preprocessing_text.text_to_word_sequence(text)
- self.assertEqual(seq, ['hello', 'world'])
-
- def test_text_to_word_sequence_multichar_split(self):
- text = 'hello!stop?world!'
- seq = preprocessing_text.text_to_word_sequence(text, split='stop')
- self.assertEqual(seq, ['hello', 'world'])
-
- def test_text_to_word_sequence_unicode(self):
- text = u'ali! veli? kırk dokuz elli'
- seq = preprocessing_text.text_to_word_sequence(text)
- self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
-
- def test_text_to_word_sequence_unicode_multichar_split(self):
- text = u'ali!stopveli?stopkırkstopdokuzstopelli'
- seq = preprocessing_text.text_to_word_sequence(text, split='stop')
- self.assertEqual(seq, [u'ali', u'veli', u'kırk', u'dokuz', u'elli'])
-
- def test_tokenizer_unicode(self):
- texts = [
- u'ali veli kırk dokuz elli', u'ali veli kırk dokuz elli veli kırk dokuz'
- ]
- tokenizer = preprocessing_text.Tokenizer(num_words=5)
- tokenizer.fit_on_texts(texts)
-
- self.assertEqual(len(tokenizer.word_counts), 5)
-
-
-if __name__ == '__main__':
- test.main()
diff --git a/tensorflow/python/keras/preprocessing/timeseries.py b/tensorflow/python/keras/preprocessing/timeseries.py
deleted file mode 100644
index abe0418..0000000
--- a/tensorflow/python/keras/preprocessing/timeseries.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Keras timeseries dataset utilities."""
-# pylint: disable=g-classes-have-attributes
-
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import keras_export
-
-
-@keras_export('keras.utils.timeseries_dataset_from_array',
- 'keras.preprocessing.timeseries_dataset_from_array',
- v1=[])
-def timeseries_dataset_from_array(
- data,
- targets,
- sequence_length,
- sequence_stride=1,
- sampling_rate=1,
- batch_size=128,
- shuffle=False,
- seed=None,
- start_index=None,
- end_index=None):
- """Creates a dataset of sliding windows over a timeseries provided as array.
-
- This function takes in a sequence of data-points gathered at
- equal intervals, along with time series parameters such as
- length of the sequences/windows, spacing between two sequence/windows, etc.,
- to produce batches of timeseries inputs and targets.
-
- Args:
- data: Numpy array or eager tensor
- containing consecutive data points (timesteps).
- Axis 0 is expected to be the time dimension.
- targets: Targets corresponding to timesteps in `data`.
- `targets[i]` should be the target
- corresponding to the window that starts at index `i`
- (see example 2 below).
- Pass None if you don't have target data (in this case the dataset will
- only yield the input data).
- sequence_length: Length of the output sequences (in number of timesteps).
- sequence_stride: Period between successive output sequences.
- For stride `s`, output samples would
- start at index `data[i]`, `data[i + s]`, `data[i + 2 * s]`, etc.
- sampling_rate: Period between successive individual timesteps
- within sequences. For rate `r`, timesteps
- `data[i], data[i + r], ... data[i + sequence_length]`
-      are used to create a sample sequence.
- batch_size: Number of timeseries samples in each batch
- (except maybe the last one).
- shuffle: Whether to shuffle output samples,
- or instead draw them in chronological order.
- seed: Optional int; random seed for shuffling.
- start_index: Optional int; data points earlier (exclusive)
- than `start_index` will not be used
- in the output sequences. This is useful to reserve part of the
- data for test or validation.
- end_index: Optional int; data points later (exclusive) than `end_index`
- will not be used in the output sequences.
- This is useful to reserve part of the data for test or validation.
-
- Returns:
- A tf.data.Dataset instance. If `targets` was passed, the dataset yields
- tuple `(batch_of_sequences, batch_of_targets)`. If not, the dataset yields
- only `batch_of_sequences`.
-
- Example 1:
-
- Consider indices `[0, 1, ... 99]`.
- With `sequence_length=10, sampling_rate=2, sequence_stride=3`,
- `shuffle=False`, the dataset will yield batches of sequences
- composed of the following indices:
-
- ```
- First sequence: [0 2 4 6 8 10 12 14 16 18]
- Second sequence: [3 5 7 9 11 13 15 17 19 21]
- Third sequence: [6 8 10 12 14 16 18 20 22 24]
- ...
- Last sequence: [78 80 82 84 86 88 90 92 94 96]
- ```
-
- In this case the last 3 data points are discarded since no full sequence
- can be generated to include them (the next sequence would have started
- at index 81, and thus its last step would have gone over 99).
-
- Example 2: Temporal regression.
-
- Consider an array `data` of scalar values, of shape `(steps,)`.
- To generate a dataset that uses the past 10
- timesteps to predict the next timestep, you would use:
-
- ```python
- input_data = data[:-10]
- targets = data[10:]
- dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
- input_data, targets, sequence_length=10)
- for batch in dataset:
- inputs, targets = batch
- assert np.array_equal(inputs[0], data[:10]) # First sequence: steps [0-9]
- assert np.array_equal(targets[0], data[10]) # Corresponding target: step 10
- break
- ```
-
- Example 3: Temporal regression for many-to-many architectures.
-
- Consider two arrays of scalar values `X` and `Y`,
-  both of shape `(100,)`. The resulting dataset should consist of samples with
-  20 timesteps each. The samples should not overlap.
- To generate a dataset that uses the current timestamp
- to predict the corresponding target timestep, you would use:
-
- ```python
- X = np.arange(100)
- Y = X*2
-
- sample_length = 20
- input_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
- X, None, sequence_length=sample_length, sequence_stride=sample_length)
- target_dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
- Y, None, sequence_length=sample_length, sequence_stride=sample_length)
-
- for batch in zip(input_dataset, target_dataset):
- inputs, targets = batch
- assert np.array_equal(inputs[0], X[:sample_length])
-
- # second sample equals output timestamps 20-40
- assert np.array_equal(targets[1], Y[sample_length:2*sample_length])
- break
- ```
- """
- if start_index and (start_index < 0 or start_index >= len(data)):
- raise ValueError('start_index must be higher than 0 and lower than the '
- 'length of the data. Got: start_index=%s '
- 'for data of length %s.' % (start_index, len(data)))
- if end_index:
- if start_index and end_index <= start_index:
- raise ValueError('end_index must be higher than start_index. Got: '
- 'start_index=%s, end_index=%s.' %
- (start_index, end_index))
- if end_index >= len(data):
- raise ValueError('end_index must be lower than the length of the data. '
- 'Got: end_index=%s' % (end_index,))
- if end_index <= 0:
- raise ValueError('end_index must be higher than 0. '
- 'Got: end_index=%s' % (end_index,))
-
- # Validate strides
- if sampling_rate <= 0 or sampling_rate >= len(data):
- raise ValueError(
- 'sampling_rate must be higher than 0 and lower than '
- 'the length of the data. Got: '
- 'sampling_rate=%s for data of length %s.' % (sampling_rate, len(data)))
- if sequence_stride <= 0 or sequence_stride >= len(data):
- raise ValueError(
- 'sequence_stride must be higher than 0 and lower than '
- 'the length of the data. Got: sequence_stride=%s '
- 'for data of length %s.' % (sequence_stride, len(data)))
-
- if start_index is None:
- start_index = 0
- if end_index is None:
- end_index = len(data)
-
- # Determine the lowest dtype to store start positions (to lower memory usage).
- num_seqs = end_index - start_index - (sequence_length * sampling_rate) + 1
- if targets is not None:
- num_seqs = min(num_seqs, len(targets))
- if num_seqs < 2147483647:
- index_dtype = 'int32'
- else:
- index_dtype = 'int64'
-
- # Generate start positions
- start_positions = np.arange(0, num_seqs, sequence_stride, dtype=index_dtype)
- if shuffle:
- if seed is None:
- seed = np.random.randint(1e6)
- rng = np.random.RandomState(seed)
- rng.shuffle(start_positions)
-
- sequence_length = math_ops.cast(sequence_length, dtype=index_dtype)
- sampling_rate = math_ops.cast(sampling_rate, dtype=index_dtype)
-
- positions_ds = dataset_ops.Dataset.from_tensors(start_positions).repeat()
-
-  # For each initial window position, generate the indices of the window elements
- indices = dataset_ops.Dataset.zip(
- (dataset_ops.Dataset.range(len(start_positions)), positions_ds)).map(
- lambda i, positions: math_ops.range( # pylint: disable=g-long-lambda
- positions[i],
- positions[i] + sequence_length * sampling_rate,
- sampling_rate),
- num_parallel_calls=dataset_ops.AUTOTUNE)
-
- dataset = sequences_from_indices(data, indices, start_index, end_index)
- if targets is not None:
- indices = dataset_ops.Dataset.zip(
- (dataset_ops.Dataset.range(len(start_positions)), positions_ds)).map(
- lambda i, positions: positions[i],
- num_parallel_calls=dataset_ops.AUTOTUNE)
- target_ds = sequences_from_indices(
- targets, indices, start_index, end_index)
- dataset = dataset_ops.Dataset.zip((dataset, target_ds))
- if shuffle:
- # Shuffle locally at each iteration
- dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed)
- dataset = dataset.batch(batch_size)
- return dataset
-
-
-def sequences_from_indices(array, indices_ds, start_index, end_index):
- dataset = dataset_ops.Dataset.from_tensors(array[start_index : end_index])
- dataset = dataset_ops.Dataset.zip((dataset.repeat(), indices_ds)).map(
- lambda steps, inds: array_ops.gather(steps, inds), # pylint: disable=unnecessary-lambda
- num_parallel_calls=dataset_ops.AUTOTUNE)
- return dataset
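The window construction above (strided start positions, then a per-window index range gathered from the data) can be summarized in plain NumPy. A sketch that mirrors the deleted indexing logic, including its `sequence_length * sampling_rate` bound on the last start position:

```python
import numpy as np

def window_indices(num_points, sequence_length, sequence_stride=1, sampling_rate=1):
    # Last admissible start position, using the same bound as the implementation above.
    last_start = num_points - sequence_length * sampling_rate
    starts = np.arange(0, last_start + 1, sequence_stride)
    return np.stack([
        np.arange(s, s + sequence_length * sampling_rate, sampling_rate)
        for s in starts
    ])

data = np.arange(100)
idx = window_indices(len(data), sequence_length=10, sequence_stride=3, sampling_rate=2)
print(idx[0])        # [ 0  2  4  6  8 10 12 14 16 18]
print(idx[-1])       # [78 80 82 84 86 88 90 92 94 96]
windows = data[idx]  # shape (num_windows, 10), same sequences as Example 1 above
```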
diff --git a/tensorflow/python/keras/preprocessing/timeseries_test.py b/tensorflow/python/keras/preprocessing/timeseries_test.py
deleted file mode 100644
index 1099ed9..0000000
--- a/tensorflow/python/keras/preprocessing/timeseries_test.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for timeseries."""
-
-import numpy as np
-
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.keras.preprocessing import timeseries
-from tensorflow.python.platform import test
-
-
-class TimeseriesDatasetTest(test.TestCase):
-
- def test_basics(self):
- # Test ordering, targets, sequence length, batch size
- data = np.arange(100)
- targets = data * 2
- dataset = timeseries.timeseries_dataset_from_array(
- data, targets, sequence_length=9, batch_size=5)
- # Expect 19 batches
- for i, batch in enumerate(dataset):
- self.assertLen(batch, 2)
- inputs, targets = batch
- if i < 18:
- self.assertEqual(inputs.shape, (5, 9))
- if i == 18:
- # Last batch: size 2
- self.assertEqual(inputs.shape, (2, 9))
- # Check target values
- self.assertAllClose(targets, inputs[:, 0] * 2)
- for j in range(min(5, len(inputs))):
- # Check each sample in the batch
- self.assertAllClose(inputs[j], np.arange(i * 5 + j, i * 5 + j + 9))
-
- def test_timeseries_regression(self):
- # Test simple timeseries regression use case
- data = np.arange(10)
- offset = 3
- targets = data[offset:]
- dataset = timeseries.timeseries_dataset_from_array(
- data, targets, sequence_length=offset, batch_size=1)
- i = 0
- for batch in dataset:
- self.assertLen(batch, 2)
- inputs, targets = batch
- self.assertEqual(inputs.shape, (1, 3))
- # Check values
- self.assertAllClose(targets[0], data[offset + i])
- self.assertAllClose(inputs[0], data[i : i + offset])
- i += 1
- self.assertEqual(i, 7) # Expect 7 batches
-
- def test_no_targets(self):
- data = np.arange(50)
- dataset = timeseries.timeseries_dataset_from_array(
- data, None, sequence_length=10, batch_size=5)
- # Expect 9 batches
- i = None
- for i, batch in enumerate(dataset):
- if i < 8:
- self.assertEqual(batch.shape, (5, 10))
- elif i == 8:
- self.assertEqual(batch.shape, (1, 10))
- for j in range(min(5, len(batch))):
- # Check each sample in the batch
- self.assertAllClose(batch[j], np.arange(i * 5 + j, i * 5 + j + 10))
- self.assertEqual(i, 8)
-
- def test_shuffle(self):
- # Test cross-epoch random order and seed determinism
- data = np.arange(10)
- targets = data * 2
- dataset = timeseries.timeseries_dataset_from_array(
- data, targets, sequence_length=5, batch_size=1, shuffle=True, seed=123)
- first_seq = None
- for x, y in dataset.take(1):
- self.assertNotAllClose(x, np.arange(0, 5))
- self.assertAllClose(x[:, 0] * 2, y)
- first_seq = x
- # Check that a new iteration with the same dataset yields different results
- for x, _ in dataset.take(1):
- self.assertNotAllClose(x, first_seq)
-    # Check determinism with same seed
- dataset = timeseries.timeseries_dataset_from_array(
- data, targets, sequence_length=5, batch_size=1, shuffle=True, seed=123)
- for x, _ in dataset.take(1):
- self.assertAllClose(x, first_seq)
-
- def test_sampling_rate(self):
- data = np.arange(100)
- targets = data * 2
- dataset = timeseries.timeseries_dataset_from_array(
- data, targets, sequence_length=9, batch_size=5, sampling_rate=2)
- for i, batch in enumerate(dataset):
- self.assertLen(batch, 2)
- inputs, targets = batch
- if i < 16:
- self.assertEqual(inputs.shape, (5, 9))
- if i == 16:
- # Last batch: size 3
- self.assertEqual(inputs.shape, (3, 9))
- # Check target values
- self.assertAllClose(inputs[:, 0] * 2, targets)
- for j in range(min(5, len(inputs))):
- # Check each sample in the batch
- start_index = i * 5 + j
- end_index = start_index + 9 * 2
- self.assertAllClose(inputs[j], np.arange(start_index, end_index, 2))
-
- def test_sequence_stride(self):
- data = np.arange(100)
- targets = data * 2
- dataset = timeseries.timeseries_dataset_from_array(
- data, targets, sequence_length=9, batch_size=5, sequence_stride=3)
- for i, batch in enumerate(dataset):
- self.assertLen(batch, 2)
- inputs, targets = batch
- if i < 6:
- self.assertEqual(inputs.shape, (5, 9))
- if i == 6:
- # Last batch: size 1
- self.assertEqual(inputs.shape, (1, 9))
- # Check target values
- self.assertAllClose(inputs[:, 0] * 2, targets)
- for j in range(min(5, len(inputs))):
- # Check each sample in the batch
- start_index = i * 5 * 3 + j * 3
- end_index = start_index + 9
- self.assertAllClose(inputs[j],
- np.arange(start_index, end_index))
-
- def test_start_and_end_index(self):
- data = np.arange(100)
- dataset = timeseries.timeseries_dataset_from_array(
- data, None,
- sequence_length=9, batch_size=5, sequence_stride=3, sampling_rate=2,
- start_index=10, end_index=90)
- for batch in dataset:
- self.assertAllLess(batch[0], 90)
- self.assertAllGreater(batch[0], 9)
-
- def test_errors(self):
- # bad start index
- with self.assertRaisesRegex(ValueError, 'start_index must be '):
- _ = timeseries.timeseries_dataset_from_array(
- np.arange(10), None, 3, start_index=-1)
- with self.assertRaisesRegex(ValueError, 'start_index must be '):
- _ = timeseries.timeseries_dataset_from_array(
- np.arange(10), None, 3, start_index=11)
- # bad end index
- with self.assertRaisesRegex(ValueError, 'end_index must be '):
- _ = timeseries.timeseries_dataset_from_array(
- np.arange(10), None, 3, end_index=-1)
- with self.assertRaisesRegex(ValueError, 'end_index must be '):
- _ = timeseries.timeseries_dataset_from_array(
- np.arange(10), None, 3, end_index=11)
- # bad sampling_rate
- with self.assertRaisesRegex(ValueError, 'sampling_rate must be '):
- _ = timeseries.timeseries_dataset_from_array(
- np.arange(10), None, 3, sampling_rate=0)
- # bad sequence stride
- with self.assertRaisesRegex(ValueError, 'sequence_stride must be '):
- _ = timeseries.timeseries_dataset_from_array(
- np.arange(10), None, 3, sequence_stride=0)
-
-
-if __name__ == '__main__':
- v2_compat.enable_v2_behavior()
- test.main()
diff --git a/tensorflow/python/keras/utils/BUILD b/tensorflow/python/keras/utils/BUILD
index 2df988d..af9568f 100644
--- a/tensorflow/python/keras/utils/BUILD
+++ b/tensorflow/python/keras/utils/BUILD
@@ -56,16 +56,6 @@
)
py_library(
- name = "kpl_test_utils",
- srcs = ["kpl_test_utils.py"],
- srcs_version = "PY3",
- deps = [
- "//tensorflow/python/keras",
- "//tensorflow/python/keras/layers/preprocessing:string_lookup",
- ],
-)
-
-py_library(
name = "data_utils",
srcs = ["data_utils.py"],
srcs_version = "PY3",
diff --git a/tensorflow/python/keras/utils/kpl_test_utils.py b/tensorflow/python/keras/utils/kpl_test_utils.py
deleted file mode 100644
index 57696a7..0000000
--- a/tensorflow/python/keras/utils/kpl_test_utils.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test related utilities for KPL + tf.distribute."""
-
-import random
-import tempfile
-
-from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import def_function
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.keras.layers.preprocessing import string_lookup
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class DistributeKplTestUtils(test.TestCase):
- """Utils for test of tf.distribute + KPL."""
- FEATURE_VOCAB = [
- "avenger", "ironman", "batman", "hulk", "spiderman", "kingkong",
- "wonder_woman"
- ]
- LABEL_VOCAB = ["yes", "no"]
-
- def define_kpls_for_training(self, use_adapt):
- """Function that defines KPL used for unit tests of tf.distribute.
-
- Args:
- use_adapt: if adapt will be called. False means there will be precomputed
- statistics.
-
- Returns:
- feature_mapper: a simple keras model with one keras StringLookup layer
- which maps feature to index.
- label_mapper: similar to feature_mapper, but maps label to index.
-
- """
- if use_adapt:
- feature_lookup_layer = (
- string_lookup.StringLookup(
- num_oov_indices=1))
- feature_lookup_layer.adapt(self.FEATURE_VOCAB)
- label_lookup_layer = (
- string_lookup.StringLookup(
- num_oov_indices=0, mask_token=None))
- label_lookup_layer.adapt(self.LABEL_VOCAB)
- else:
- feature_lookup_layer = (
- string_lookup.StringLookup(
- vocabulary=self.FEATURE_VOCAB, num_oov_indices=1))
- label_lookup_layer = (
- string_lookup.StringLookup(
- vocabulary=self.LABEL_VOCAB, num_oov_indices=0, mask_token=None))
-
- raw_feature_input = keras.layers.Input(
- shape=(3,), dtype=dtypes.string, name="feature", ragged=True)
- feature_id_input = feature_lookup_layer(raw_feature_input)
- feature_mapper = keras.Model({"features": raw_feature_input},
- feature_id_input)
-
- raw_label_input = keras.layers.Input(
- shape=(1,), dtype=dtypes.string, name="label")
- label_id_input = label_lookup_layer(raw_label_input)
- label_mapper = keras.Model({"label": raw_label_input}, label_id_input)
-
- return feature_mapper, label_mapper
-
- def dataset_fn(self, feature_mapper, label_mapper):
- """Function that generates dataset for test of tf.distribute + KPL.
-
- Args:
- feature_mapper: a simple keras model with one keras StringLookup layer
- which maps feature to index.
- label_mapper: similar to feature_mapper, but maps label to index.
-
- Returns:
- Generated dataset for test of tf.distribute + KPL.
-
- """
-
- def feature_and_label_gen():
- # Generator of dataset.
- while True:
- features = random.sample(self.FEATURE_VOCAB, 3)
- label = ["yes"] if self.FEATURE_VOCAB[0] in features else ["no"]
- yield {"features": features, "label": label}
-
- raw_dataset = dataset_ops.Dataset.from_generator(
- feature_and_label_gen,
- output_signature={
- "features": tensor_spec.TensorSpec([3], dtypes.string),
- "label": tensor_spec.TensorSpec([1], dtypes.string)
- }).shuffle(100).batch(32)
-
- train_dataset = raw_dataset.map(lambda x: ( # pylint: disable=g-long-lambda
- {
- "features": feature_mapper(x["features"])
- }, label_mapper(x["label"])))
- return train_dataset
-
- def define_model(self):
- """A simple model for test of tf.distribute + KPL."""
- # Create the model. The input needs to be compatible with KPLs.
- model_input = keras.layers.Input(
- shape=(3,), dtype=dtypes.int64, name="model_input")
-
- # input_dim includes a mask token and an oov token.
- emb_output = keras.layers.Embedding(
- input_dim=len(self.FEATURE_VOCAB) + 2, output_dim=20)(
- model_input)
- emb_output = math_ops.reduce_mean(emb_output, axis=1)
- dense_output = keras.layers.Dense(
- units=1, activation="sigmoid")(
- emb_output)
- model = keras.Model({"features": model_input}, dense_output)
- return model
-
- def define_reverse_lookup_layer(self):
- """Create string reverse lookup layer for serving."""
-
- label_inverse_lookup_layer = string_lookup.StringLookup(
- num_oov_indices=0,
- mask_token=None,
- vocabulary=self.LABEL_VOCAB,
- invert=True)
- return label_inverse_lookup_layer
-
- def create_serving_signature(self, model, feature_mapper,
- label_inverse_lookup_layer):
- """Create serving signature for the given model."""
-
- @def_function.function
- def serve_fn(raw_features):
- raw_features = array_ops.expand_dims(raw_features, axis=0)
- transformed_features = model.feature_mapper(raw_features)
- outputs = model(transformed_features)
- outputs = array_ops.squeeze(outputs, axis=0)
- outputs = math_ops.cast(math_ops.greater(outputs, 0.5), dtypes.int64)
- decoded_outputs = model.label_inverse_lookup_layer(outputs)
- return array_ops.squeeze(decoded_outputs, axis=0)
-
- model.feature_mapper = feature_mapper
- model.label_inverse_lookup_layer = label_inverse_lookup_layer
- # serving does NOT have batch dimension
- return serve_fn.get_concrete_function(
- tensor_spec.TensorSpec(
- shape=(3), dtype=dtypes.string, name="example"))
-
- def test_save_load_serving_model(self, model, feature_mapper,
- label_inverse_lookup_layer):
- """Test save/load/serving model."""
-
- serving_fn = self.create_serving_signature(model, feature_mapper,
- label_inverse_lookup_layer)
-
- saved_model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
- model.save(saved_model_dir, save_format="tf",
- signatures={"serving_default": serving_fn})
-
- # Test the saved_model.
- loaded_serving_fn = keras.saving.save.load_model(
- saved_model_dir).signatures["serving_default"]
-
- # check the result w/ and w/o avenger.
- prediction0 = loaded_serving_fn(
- constant_op.constant(["avenger", "ironman", "avenger"]))["output_0"]
- self.assertIn(prediction0.numpy().decode("UTF-8"), ("yes", "no"))
-
- prediction1 = loaded_serving_fn(
- constant_op.constant(["ironman", "ironman", "unkonwn"]))["output_0"]
- self.assertIn(prediction1.numpy().decode("UTF-8"), ("yes", "no"))
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 0707e39..f63cbbc 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -129,12 +129,10 @@
"//tensorflow/python/distribute:multi_process_runner",
"//tensorflow/python/eager:eager_pip",
"//tensorflow/python/keras:combinations",
- "//tensorflow/python/keras/layers/preprocessing:preprocessing_test_utils",
"//tensorflow/python/keras/distribute:distribute_test_lib_pip",
"//tensorflow/python/keras/distribute:strategy_combinations",
"//tensorflow/python/keras/mixed_precision:test_util",
"//tensorflow/python/keras/utils:dataset_creator",
- "//tensorflow/python/keras/utils:kpl_test_utils",
"//tensorflow/python/kernel_tests:cudnn_deterministic_base",
"//tensorflow/python/kernel_tests:bias_op_base",
"//tensorflow/python/kernel_tests:sparse_xent_op_test_base",