diff options
author | Karmel Allison <karmel@google.com> | 2018-05-23 20:53:15 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-05-23 20:56:01 -0700 |
commit | 81ef70a0bc22163d34f1e0425122d6a93bf02eac (patch) | |
tree | c54ca4015958c17fc45573f1a553ff8d7907f451 /tensorflow/python/lib | |
parent | 8f863f3d71542c47390f2d40348b72296ed5c4be (diff) |
Resolve name collisions with assets in SavedModels by deduplicating names that
point to distinct files.
PiperOrigin-RevId: 197835288
Diffstat (limited to 'tensorflow/python/lib')
-rw-r--r-- | tensorflow/python/lib/io/file_io.py | 58 | ||||
-rw-r--r-- | tensorflow/python/lib/io/file_io_test.py | 91 |
2 files changed, 149 insertions, 0 deletions
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py index 59f5075f17..f22fb253e4 100644 --- a/tensorflow/python/lib/io/file_io.py +++ b/tensorflow/python/lib/io/file_io.py @@ -21,6 +21,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import binascii import os import uuid @@ -33,6 +34,10 @@ from tensorflow.python.util import compat from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export +# A good default block size depends on the system in question. +# A somewhat conservative default chosen here. +_DEFAULT_BLOCK_SIZE = 16 * 1024 * 1024 + class FileIO(object): """FileIO class that exposes methods to read / write to / from files. @@ -551,3 +556,56 @@ def stat(filename): with errors.raise_exception_on_not_ok_status() as status: pywrap_tensorflow.Stat(compat.as_bytes(filename), file_statistics, status) return file_statistics + + +def filecmp(filename_a, filename_b): + """Compare two files, returning True if they are the same, False otherwise. + + We check size first and return False quickly if the files are different sizes. + If they are the same size, we go on to generate a crc for the whole file. + + You might wonder: why not use Python's filecmp.cmp() instead? The answer is + that the builtin library is not robust to the many different filesystems + TensorFlow runs on, and so we here perform a similar comparison with + the more robust FileIO. + + Args: + filename_a: string path to the first file. + filename_b: string path to the second file. + + Returns: + True if the files are the same, False otherwise. + """ + size_a = FileIO(filename_a, "rb").size() + size_b = FileIO(filename_b, "rb").size() + if size_a != size_b: + return False + + # Size is the same. Do a full check. 
+ crc_a = file_crc32(filename_a) + crc_b = file_crc32(filename_b) + return crc_a == crc_b + + +def file_crc32(filename, block_size=_DEFAULT_BLOCK_SIZE): + """Get the crc32 of the passed file. + + The crc32 of a file can be used for error checking; two files with the same + crc32 are considered equivalent. Note that the entire file must be read + to produce the crc32. + + Args: + filename: string, path to a file + block_size: Integer, process the files by reading blocks of `block_size` + bytes. Use -1 to read the file at once. + + Returns: + hexadecimal as string, the crc32 of the passed file. + """ + crc = 0 + with FileIO(filename, mode="rb") as f: + chunk = f.read(n=block_size) + while chunk: + crc = binascii.crc32(chunk, crc) + chunk = f.read(n=block_size) + return hex(crc & 0xFFFFFFFF) diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py index 223858edfa..c21eb93103 100644 --- a/tensorflow/python/lib/io/file_io_test.py +++ b/tensorflow/python/lib/io/file_io_test.py @@ -491,5 +491,96 @@ class FileIoTest(test.TestCase): v = file_io.file_exists(file_path) self.assertEqual(v, True) + def testFilecmp(self): + file1 = os.path.join(self._base_dir, "file1") + file_io.write_string_to_file(file1, "This is a sentence\n" * 100) + + file2 = os.path.join(self._base_dir, "file2") + file_io.write_string_to_file(file2, "This is another sentence\n" * 100) + + file3 = os.path.join(self._base_dir, "file3") + file_io.write_string_to_file(file3, u"This is another sentence\n" * 100) + + self.assertFalse(file_io.filecmp(file1, file2)) + self.assertTrue(file_io.filecmp(file2, file3)) + + def testFilecmpSameSize(self): + file1 = os.path.join(self._base_dir, "file1") + file_io.write_string_to_file(file1, "This is a sentence\n" * 100) + + file2 = os.path.join(self._base_dir, "file2") + file_io.write_string_to_file(file2, "This is b sentence\n" * 100) + + file3 = os.path.join(self._base_dir, "file3") + file_io.write_string_to_file(file3, u"This is 
b sentence\n" * 100) + + self.assertFalse(file_io.filecmp(file1, file2)) + self.assertTrue(file_io.filecmp(file2, file3)) + + def testFilecmpBinary(self): + file1 = os.path.join(self._base_dir, "file1") + file_io.FileIO(file1, "wb").write("testing\n\na") + + file2 = os.path.join(self._base_dir, "file2") + file_io.FileIO(file2, "wb").write("testing\n\nb") + + file3 = os.path.join(self._base_dir, "file3") + file_io.FileIO(file3, "wb").write("testing\n\nb") + + file4 = os.path.join(self._base_dir, "file4") + file_io.FileIO(file4, "wb").write("testing\n\ntesting") + + self.assertFalse(file_io.filecmp(file1, file2)) + self.assertFalse(file_io.filecmp(file1, file4)) + self.assertTrue(file_io.filecmp(file2, file3)) + + def testFileCrc32(self): + file1 = os.path.join(self._base_dir, "file1") + file_io.write_string_to_file(file1, "This is a sentence\n" * 100) + crc1 = file_io.file_crc32(file1) + + file2 = os.path.join(self._base_dir, "file2") + file_io.write_string_to_file(file2, "This is another sentence\n" * 100) + crc2 = file_io.file_crc32(file2) + + file3 = os.path.join(self._base_dir, "file3") + file_io.write_string_to_file(file3, "This is another sentence\n" * 100) + crc3 = file_io.file_crc32(file3) + + self.assertTrue(crc1 != crc2) + self.assertEqual(crc2, crc3) + + def testFileCrc32WithBytes(self): + file1 = os.path.join(self._base_dir, "file1") + file_io.write_string_to_file(file1, "This is a sentence\n" * 100) + crc1 = file_io.file_crc32(file1, block_size=24) + + file2 = os.path.join(self._base_dir, "file2") + file_io.write_string_to_file(file2, "This is another sentence\n" * 100) + crc2 = file_io.file_crc32(file2, block_size=24) + + file3 = os.path.join(self._base_dir, "file3") + file_io.write_string_to_file(file3, "This is another sentence\n" * 100) + crc3 = file_io.file_crc32(file3, block_size=-1) + + self.assertTrue(crc1 != crc2) + self.assertEqual(crc2, crc3) + + def testFileCrc32Binary(self): + file1 = os.path.join(self._base_dir, "file1") + 
file_io.FileIO(file1, "wb").write("testing\n\n") + crc1 = file_io.file_crc32(file1) + + file2 = os.path.join(self._base_dir, "file2") + file_io.FileIO(file2, "wb").write("testing\n\n\n") + crc2 = file_io.file_crc32(file2) + + file3 = os.path.join(self._base_dir, "file3") + file_io.FileIO(file3, "wb").write("testing\n\n\n") + crc3 = file_io.file_crc32(file3) + + self.assertTrue(crc1 != crc2) + self.assertEqual(crc2, crc3) + if __name__ == "__main__": test.main() |