# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the `SnapshotDataset` transformation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time

from absl.testing import parameterized

from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
from tensorflow.python.data.experimental.ops import snapshot
from tensorflow.python.data.kernel_tests import test_base
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.ops import readers as core_readers
from tensorflow.python.framework import combinations
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.platform import test


class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase,
parameterized.TestCase):

  def setUp(self):
super(SnapshotDatasetTest, self).setUp()
self.removeTFRecords()

  def removeTFRecords(self):
for filename in self.test_filenames:
os.remove(filename)
self.test_filenames = []

  def setUpTFRecord(self, num_files=10, num_records=10):
self._num_files = num_files
self._num_records = num_records
self.test_filenames = self._createFiles()

  def makeSnapshotDirectory(self):
tmpdir = self.get_temp_dir()
tmpdir = os.path.join(tmpdir, "snapshot")
os.mkdir(tmpdir)
return tmpdir

  def assertSnapshotDirectoryContains(
self, directory, num_fingerprints, num_runs_per_fp, num_snapshot_files):
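    """Asserts that `directory` matches the expected snapshot layout.

    The layout checked below is:
      <directory>/<fingerprint>/<run_id>/<NNNNNNNN>.snapshot
    with a `snapshot.metadata` file alongside the run directories in each
    fingerprint directory.
    """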
dirlist = os.listdir(directory)
self.assertLen(dirlist, num_fingerprints)
for i in range(num_fingerprints):
fingerprint_dir = os.path.join(directory, dirlist[i])
fingerprint_dir_list = sorted(os.listdir(fingerprint_dir))
self.assertLen(fingerprint_dir_list, num_runs_per_fp + 1)
self.assertEqual(fingerprint_dir_list[num_runs_per_fp],
"snapshot.metadata")
for j in range(num_runs_per_fp):
run_dir = os.path.join(fingerprint_dir, fingerprint_dir_list[j])
run_dirlist = sorted(os.listdir(run_dir))
self.assertLen(run_dirlist, num_snapshot_files)
file_counter = 0
for filename in run_dirlist:
self.assertEqual(filename, "%08d.snapshot" % file_counter)
file_counter += 1

  @combinations.generate(test_base.default_test_combinations())
def testWriteDifferentPipelinesInOneDirectory(self):
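    """Pipelines with different fingerprints write to separate subdirectories."""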
tmpdir = self.makeSnapshotDirectory()
dataset = dataset_ops.Dataset.range(1000)
dataset = dataset.apply(snapshot.snapshot(tmpdir))
self.assertDatasetProduces(dataset, list(range(1000)))
dataset = dataset_ops.Dataset.range(1001)
dataset = dataset.apply(snapshot.snapshot(tmpdir))
self.assertDatasetProduces(dataset, list(range(1001)))
self.assertSnapshotDirectoryContains(tmpdir, 2, 1, 1)

  @combinations.generate(test_base.default_test_combinations())
def testWriteSnapshotMultipleSimultaneous(self):
tmpdir = self.makeSnapshotDirectory()
dataset1 = dataset_ops.Dataset.range(1000)
dataset1 = dataset1.apply(snapshot.snapshot(tmpdir))
next1 = self.getNext(dataset1)
dataset2 = dataset_ops.Dataset.range(1000)
dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
next2 = self.getNext(dataset2)
for _ in range(1000):
self.evaluate(next1())
self.evaluate(next2())
    # We check that only one copy of the metadata has been written; the
    # writer that lost the race falls back to passthrough mode.
self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1)

  @combinations.generate(test_base.default_test_combinations())
def testGetNextCreatesDir(self):
tmpdir = self.makeSnapshotDirectory()
    # We create two iterators but only pull elements from one of them.
dataset1 = dataset_ops.Dataset.range(1000)
dataset1 = dataset1.apply(snapshot.snapshot(tmpdir))
next1 = self.getNext(dataset1)
dataset2 = dataset_ops.Dataset.range(1001)
dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
_ = self.getNext(dataset2)
for _ in range(1000):
self.evaluate(next1())
# We check that only one directory is created.
self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1)

  @combinations.generate(
combinations.times(
test_base.default_test_combinations(),
combinations.combine(
compression=[snapshot.COMPRESSION_NONE,
snapshot.COMPRESSION_GZIP])))
def testWriteSnapshotSimpleSuccessful(self, compression):
tmpdir = self.makeSnapshotDirectory()
dataset = dataset_ops.Dataset.range(1000)
dataset = dataset.apply(snapshot.snapshot(tmpdir, compression=compression))
self.assertDatasetProduces(dataset, list(range(1000)))
self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1)

  @combinations.generate(test_base.default_test_combinations())
def testWriteSnapshotRepeatAfterwards(self):
tmpdir = self.makeSnapshotDirectory()
dataset = dataset_ops.Dataset.range(10)
dataset = dataset.apply(snapshot.snapshot(tmpdir))
dataset = dataset.repeat(10)
self.assertDatasetProduces(dataset, list(range(10)) * 10)
self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1)

  @combinations.generate(
combinations.times(
test_base.default_test_combinations(),
combinations.combine(
compression=[snapshot.COMPRESSION_NONE,
snapshot.COMPRESSION_GZIP])))
def testReadSnapshotBackAfterWrite(self, compression):
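    """Writes a snapshot, deletes the source files, and reads the data back."""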
self.setUpTFRecord()
filenames = self.test_filenames
expected = [
b"Record %d of file %d" % (r, f) # pylint:disable=g-complex-comprehension
for f in range(0, 10)
for r in range(0, 10)
]
tmpdir = self.makeSnapshotDirectory()
dataset = core_readers._TFRecordDataset(filenames)
dataset = dataset.apply(snapshot.snapshot(tmpdir, compression=compression))
self.assertDatasetProduces(dataset, expected)
    # Remove the original files and try to read the data back only from the
    # snapshot.
self.removeTFRecords()
dataset2 = core_readers._TFRecordDataset(filenames)
dataset2 = dataset2.apply(snapshot.snapshot(
tmpdir, compression=compression))
self.assertDatasetProduces(dataset2, expected)

  @combinations.generate(test_base.default_test_combinations())
def testReadSnapshotParallelAfterWrite(self):
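    """Reads a snapshot back with multiple reader threads.

    Parallel reads may reorder elements, hence `assert_items_equal=True`.
    """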
self.setUpTFRecord(10, 4000)
filenames = self.test_filenames
expected = [
b"Record %d of file %d" % (r, f) # pylint:disable=g-complex-comprehension
for f in range(0, 10)
for r in range(0, 4000)
]
tmpdir = self.makeSnapshotDirectory()
dataset = core_readers._TFRecordDataset(filenames)
dataset = dataset.apply(
snapshot.snapshot(
tmpdir,
shard_size_bytes=1024 * 1024,
num_reader_threads=2,
reader_buffer_size=10))
self.assertDatasetProduces(dataset, expected, assert_items_equal=True)
    # Remove the original files and try to read the data back only from the
    # snapshot.
self.removeTFRecords()
dataset2 = core_readers._TFRecordDataset(filenames)
dataset2 = dataset2.apply(
snapshot.snapshot(
tmpdir,
shard_size_bytes=1024 * 1024,
num_reader_threads=2,
reader_buffer_size=10))
self.assertDatasetProduces(dataset2, expected, assert_items_equal=True)

  @combinations.generate(
combinations.times(
test_base.default_test_combinations(),
combinations.times(
combinations.combine(
compression=[snapshot.COMPRESSION_NONE,
snapshot.COMPRESSION_GZIP]),
combinations.combine(threads=2, size=[1, 2]) +
combinations.combine(threads=8, size=[1, 4, 8]))))
def testReadSnapshotBackAfterMultiThreadedWrite(
self, compression, threads, size):
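    """Writes a snapshot with multiple writer threads, then reads it back.

    With multiple writers the element order in the snapshot is not
    guaranteed, hence `assert_items_equal=True` on the read-back check.
    """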
self.setUpTFRecord()
filenames = self.test_filenames
expected = [
b"Record %d of file %d" % (r, f) # pylint:disable=g-complex-comprehension
for f in range(0, 10)
for r in range(0, 10)
]
tmpdir = self.makeSnapshotDirectory()
dataset = core_readers._TFRecordDataset(filenames)
dataset = dataset.apply(
snapshot.snapshot(
tmpdir,
compression=compression,
num_writer_threads=threads,
writer_buffer_size=size))
self.assertDatasetProduces(dataset, expected)
    # Remove the original files and try to read the data back only from the
    # snapshot.
self.removeTFRecords()
dataset2 = core_readers._TFRecordDataset(filenames)
dataset2 = dataset2.apply(
snapshot.snapshot(tmpdir, compression=compression))
self.assertDatasetProduces(dataset2, expected, assert_items_equal=True)

  @combinations.generate(test_base.default_test_combinations())
def testSameFingerprintWithDifferentInitializationOrder(self):
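    """Creation order should not change the fingerprint of a pipeline."""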
tmpdir = self.makeSnapshotDirectory()
dataset1 = dataset_ops.Dataset.range(0, 100)
dataset2 = dataset_ops.Dataset.range(100, 200)
dataset3 = dataset_ops.Dataset.range(200, 300)
dataset = dataset1.concatenate(dataset2).concatenate(dataset3)
dataset = dataset.apply(snapshot.snapshot(tmpdir))
self.assertDatasetProduces(dataset, list(range(300)))
dataset4 = dataset_ops.Dataset.range(200, 300)
dataset5 = dataset_ops.Dataset.range(100, 200)
dataset6 = dataset_ops.Dataset.range(0, 100)
dataset = dataset6.concatenate(dataset5).concatenate(dataset4)
dataset = dataset.apply(snapshot.snapshot(tmpdir))
self.assertDatasetProduces(dataset, list(range(300)))
self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1)

  @combinations.generate(test_base.default_test_combinations())
def testExpiredSnapshotRewrite(self):
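    """An expired pending snapshot can be claimed by a new run.

    A snapshot left unfinalized for longer than
    `pending_snapshot_expiry_seconds` should be restarted by a second
    pipeline, creating a second run directory under the same fingerprint.
    """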
tmpdir = self.makeSnapshotDirectory()
dataset1 = dataset_ops.Dataset.range(1000)
dataset1 = dataset1.apply(
snapshot.snapshot(tmpdir, pending_snapshot_expiry_seconds=1))
next1 = self.getNext(dataset1)
    # Don't finish reading dataset1, so that it is never finalized.
for _ in range(500):
self.evaluate(next1())
self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1)
time.sleep(2)
    # We create dataset2 only after running through dataset1 because, in eager
    # mode, the snapshot state is determined immediately upon dataset creation.
    # We only want to determine the snapshot state for dataset2 after the
    # first snapshot has expired.
dataset2 = dataset_ops.Dataset.range(1000)
dataset2 = dataset2.apply(
snapshot.snapshot(tmpdir, pending_snapshot_expiry_seconds=1))
next2 = self.getNext(dataset2)
for _ in range(500):
self.evaluate(next2())
self.assertSnapshotDirectoryContains(tmpdir, 1, 2, 1)

  @combinations.generate(test_base.default_test_combinations())
def testSpecifyShardSize(self):
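    """Checks the number of shard files written for a given shard size.

    Each element is a 1024x1024 tensor, roughly 4 MiB assuming the default
    float32 dtype, so 10 elements with a 10 MiB shard size should produce
    4 snapshot shard files.
    """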
tmpdir = self.makeSnapshotDirectory()
dataset = dataset_ops.Dataset.from_tensor_slices([1.0])
dataset = dataset.map(lambda x: gen_array_ops.broadcast_to(x, [1024, 1024]))
dataset = dataset.repeat(10)
dataset = dataset.apply(
snapshot.snapshot(tmpdir, shard_size_bytes=10 * 1024 * 1024))
next_fn = self.getNext(dataset)
for _ in range(10):
self.evaluate(next_fn())
self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 4)

  @combinations.generate(test_base.default_test_combinations())
def testAdditionalOperationsAfterReadBack(self):
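    """Transformations can still be applied after reading a snapshot back."""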
self.setUpTFRecord()
filenames = self.test_filenames
expected = [
b"Record %d of file %d" % (r, f) # pylint:disable=g-complex-comprehension
for f in range(0, 10)
for r in range(0, 10)
]
tmpdir = self.makeSnapshotDirectory()
dataset = core_readers._TFRecordDataset(filenames)
dataset = dataset.apply(snapshot.snapshot(tmpdir))
self.assertDatasetProduces(dataset, expected)
    # Remove the original files and try to read the data back only from the
    # snapshot.
self.removeTFRecords()
dataset2 = core_readers._TFRecordDataset(filenames)
dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
self.assertDatasetProduces(dataset2, expected)
expected_after = [
b"cord %d of file %d" % (r, f) # pylint:disable=g-complex-comprehension
for f in range(0, 10)
for r in range(0, 10)
]
dataset3 = core_readers._TFRecordDataset(filenames)
dataset3 = dataset3.apply(snapshot.snapshot(tmpdir))
dataset3 = dataset3.map(lambda x: string_ops.substr_v2(x, 2, 1000))
self.assertDatasetProduces(dataset3, expected_after)


if __name__ == "__main__":
test.main()