aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/python/lib
diff options
context:
space:
mode:
authorGravatar Karmel Allison <karmel@google.com>2018-05-23 20:53:15 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-05-23 20:56:01 -0700
commit81ef70a0bc22163d34f1e0425122d6a93bf02eac (patch)
treec54ca4015958c17fc45573f1a553ff8d7907f451 /tensorflow/python/lib
parent8f863f3d71542c47390f2d40348b72296ed5c4be (diff)
Resolve name collisions with assets in SavedModels by deduplicating names that
point to distinct files. PiperOrigin-RevId: 197835288
Diffstat (limited to 'tensorflow/python/lib')
-rw-r--r--tensorflow/python/lib/io/file_io.py58
-rw-r--r--tensorflow/python/lib/io/file_io_test.py91
2 files changed, 149 insertions, 0 deletions
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index 59f5075f17..f22fb253e4 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -21,6 +21,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import binascii
import os
import uuid
@@ -33,6 +34,10 @@ from tensorflow.python.util import compat
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export
+# A good default block size depends on the system in question.
+# A somewhat conservative default is chosen here.
+_DEFAULT_BLOCK_SIZE = 16 * 1024 * 1024
+
class FileIO(object):
"""FileIO class that exposes methods to read / write to / from files.
@@ -551,3 +556,56 @@ def stat(filename):
with errors.raise_exception_on_not_ok_status() as status:
pywrap_tensorflow.Stat(compat.as_bytes(filename), file_statistics, status)
return file_statistics
+
+
def filecmp(filename_a, filename_b):
  """Report whether two files hold the same contents.

  The cheap check runs first: when the two sizes differ the files cannot match
  and False is returned without computing any checksum. Only files of equal
  size are checksummed, and they are treated as the same when their crc32
  values agree.

  Python's builtin filecmp.cmp() is deliberately not used: it is not robust to
  the many different filesystems TensorFlow runs on, so a similar comparison
  is performed here through the more robust FileIO.

  Args:
    filename_a: string path to the first file.
    filename_b: string path to the second file.

  Returns:
    True if the files are the same, False otherwise.
  """
  length_a, length_b = [
      FileIO(path, "rb").size() for path in (filename_a, filename_b)
  ]
  if length_a == length_b:
    # Equal sizes are inconclusive; compare checksums of the full contents.
    return file_crc32(filename_a) == file_crc32(filename_b)
  return False
+
+
def file_crc32(filename, block_size=_DEFAULT_BLOCK_SIZE):
  """Get the crc32 of the passed file.

  The crc32 of a file can be used for error checking; two files with the same
  crc32 are considered equivalent. Note that the entire file must be read
  to produce the crc32.

  Args:
    filename: string, path to a file
    block_size: Integer, process the file by reading blocks of `block_size`
      bytes. Use -1 to read the file at once. Must not be 0.

  Returns:
    hexadecimal as string, the crc32 of the passed file.

  Raises:
    ValueError: if `block_size` is 0.
  """
  if block_size == 0:
    # Reading zero bytes at a time would presumably produce an empty first
    # chunk, so the loop below would never run and every file would silently
    # report the checksum of empty input.
    raise ValueError(
        "block_size must be non-zero; use -1 to read the whole file at once.")

  crc = 0
  with FileIO(filename, mode="rb") as f:
    chunk = f.read(n=block_size)
    while chunk:
      # Feed the running value back in so the result covers the whole file.
      crc = binascii.crc32(chunk, crc)
      chunk = f.read(n=block_size)
  # Mask to 32 bits so the value is unsigned on every Python version.
  return hex(crc & 0xFFFFFFFF)
diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py
index 223858edfa..c21eb93103 100644
--- a/tensorflow/python/lib/io/file_io_test.py
+++ b/tensorflow/python/lib/io/file_io_test.py
@@ -491,5 +491,96 @@ class FileIoTest(test.TestCase):
v = file_io.file_exists(file_path)
self.assertEqual(v, True)
def testFilecmp(self):
  """filecmp is False for differing text files and True for matching ones."""
  fixtures = (
      ("file1", "This is a sentence\n" * 100),
      ("file2", "This is another sentence\n" * 100),
      ("file3", u"This is another sentence\n" * 100),
  )
  paths = []
  for name, text in fixtures:
    path = os.path.join(self._base_dir, name)
    file_io.write_string_to_file(path, text)
    paths.append(path)
  first, second, third = paths

  self.assertFalse(file_io.filecmp(first, second))
  self.assertTrue(file_io.filecmp(second, third))
+
def testFilecmpSameSize(self):
  """filecmp tells apart files whose sizes match but whose bytes differ."""
  # The first two strings differ by a single character per line, so the
  # written files have equal length and the size check cannot separate them.
  fixtures = (
      ("file1", "This is a sentence\n" * 100),
      ("file2", "This is b sentence\n" * 100),
      ("file3", u"This is b sentence\n" * 100),
  )
  paths = []
  for name, text in fixtures:
    path = os.path.join(self._base_dir, name)
    file_io.write_string_to_file(path, text)
    paths.append(path)
  first, second, third = paths

  self.assertFalse(file_io.filecmp(first, second))
  self.assertTrue(file_io.filecmp(second, third))
+
def testFilecmpBinary(self):
  """filecmp compares files that were written through FileIO in "wb" mode."""
  # Each writer is used as a context manager so it is closed deterministically
  # before the file is read back, instead of depending on when the temporary
  # FileIO object happens to be garbage collected.
  file1 = os.path.join(self._base_dir, "file1")
  with file_io.FileIO(file1, "wb") as f:
    f.write("testing\n\na")

  file2 = os.path.join(self._base_dir, "file2")
  with file_io.FileIO(file2, "wb") as f:
    f.write("testing\n\nb")

  file3 = os.path.join(self._base_dir, "file3")
  with file_io.FileIO(file3, "wb") as f:
    f.write("testing\n\nb")

  file4 = os.path.join(self._base_dir, "file4")
  with file_io.FileIO(file4, "wb") as f:
    f.write("testing\n\ntesting")

  # file1/file2: same length, different last byte.
  self.assertFalse(file_io.filecmp(file1, file2))
  # file1/file4: different lengths.
  self.assertFalse(file_io.filecmp(file1, file4))
  # file2/file3: identical contents.
  self.assertTrue(file_io.filecmp(file2, file3))
+
def testFileCrc32(self):
  """file_crc32 differs for different contents and matches for equal ones."""
  file1 = os.path.join(self._base_dir, "file1")
  file_io.write_string_to_file(file1, "This is a sentence\n" * 100)
  crc1 = file_io.file_crc32(file1)

  file2 = os.path.join(self._base_dir, "file2")
  file_io.write_string_to_file(file2, "This is another sentence\n" * 100)
  crc2 = file_io.file_crc32(file2)

  file3 = os.path.join(self._base_dir, "file3")
  file_io.write_string_to_file(file3, "This is another sentence\n" * 100)
  crc3 = file_io.file_crc32(file3)

  # assertNotEqual reports both values on failure, unlike assertTrue(a != b).
  self.assertNotEqual(crc1, crc2)
  self.assertEqual(crc2, crc3)
+
def testFileCrc32WithBytes(self):
  """file_crc32 gives the same answer whatever block size is used to read."""
  file1 = os.path.join(self._base_dir, "file1")
  file_io.write_string_to_file(file1, "This is a sentence\n" * 100)
  crc1 = file_io.file_crc32(file1, block_size=24)

  file2 = os.path.join(self._base_dir, "file2")
  file_io.write_string_to_file(file2, "This is another sentence\n" * 100)
  crc2 = file_io.file_crc32(file2, block_size=24)

  # file3 has the same contents as file2 but is read in one go (-1), so an
  # equal checksum shows that chunked and whole-file reads agree.
  file3 = os.path.join(self._base_dir, "file3")
  file_io.write_string_to_file(file3, "This is another sentence\n" * 100)
  crc3 = file_io.file_crc32(file3, block_size=-1)

  # assertNotEqual reports both values on failure, unlike assertTrue(a != b).
  self.assertNotEqual(crc1, crc2)
  self.assertEqual(crc2, crc3)
+
def testFileCrc32Binary(self):
  """file_crc32 works on files written through FileIO in "wb" mode."""
  # Each writer is used as a context manager so it is closed deterministically
  # before the checksum is taken, instead of depending on when the temporary
  # FileIO object happens to be garbage collected.
  file1 = os.path.join(self._base_dir, "file1")
  with file_io.FileIO(file1, "wb") as f:
    f.write("testing\n\n")
  crc1 = file_io.file_crc32(file1)

  file2 = os.path.join(self._base_dir, "file2")
  with file_io.FileIO(file2, "wb") as f:
    f.write("testing\n\n\n")
  crc2 = file_io.file_crc32(file2)

  file3 = os.path.join(self._base_dir, "file3")
  with file_io.FileIO(file3, "wb") as f:
    f.write("testing\n\n\n")
  crc3 = file_io.file_crc32(file3)

  # assertNotEqual reports both values on failure, unlike assertTrue(a != b).
  self.assertNotEqual(crc1, crc2)
  self.assertEqual(crc2, crc3)
+
# Run the tests in this file when it is executed directly as a script.
if __name__ == "__main__":
  test.main()