diff options
Diffstat (limited to 'third_party/protobuf/3.6.0/java/core/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java')
-rw-r--r-- | third_party/protobuf/3.6.0/java/core/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java | 457 |
1 files changed, 457 insertions, 0 deletions
diff --git a/third_party/protobuf/3.6.0/java/core/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java b/third_party/protobuf/3.6.0/java/core/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java new file mode 100644 index 0000000000..1bcf63e7d9 --- /dev/null +++ b/third_party/protobuf/3.6.0/java/core/src/test/java/com/google/protobuf/IsValidUtf8TestUtil.java @@ -0,0 +1,457 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2008 Google Inc. All rights reserved. +// https://developers.google.com/protocol-buffers/ +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +package com.google.protobuf; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.lang.ref.SoftReference; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.logging.Logger; + +/** + * Shared testing code for {@link IsValidUtf8Test} and + * {@link IsValidUtf8FourByteTest}. + * + * @author jonp@google.com (Jon Perlow) + * @author martinrb@google.com (Martin Buchholz) + */ +final class IsValidUtf8TestUtil { + private static Logger logger = Logger.getLogger(IsValidUtf8TestUtil.class.getName()); + + private IsValidUtf8TestUtil() {} + + static interface ByteStringFactory { + ByteString newByteString(byte[] bytes); + } + + static final ByteStringFactory LITERAL_FACTORY = new ByteStringFactory() { + @Override + public ByteString newByteString(byte[] bytes) { + return ByteString.wrap(bytes); + } + }; + + static final ByteStringFactory HEAP_NIO_FACTORY = new ByteStringFactory() { + @Override + public ByteString newByteString(byte[] bytes) { + return new NioByteString(ByteBuffer.wrap(bytes)); + } + }; + + private static ThreadLocal<SoftReference<ByteBuffer>> directBuffer = + new ThreadLocal<SoftReference<ByteBuffer>>(); + + /** + * Factory for direct {@link ByteBuffer} instances. To reduce direct memory usage, this + * uses a thread local direct buffer. This means that each call will overwrite the buffer's + * contents from the previous call, so the calling code must be careful not to continue using + * a buffer returned from a previous invocation. + */ + static final ByteStringFactory DIRECT_NIO_FACTORY = new ByteStringFactory() { + @Override + public ByteString newByteString(byte[] bytes) { + SoftReference<ByteBuffer> ref = directBuffer.get(); + ByteBuffer buffer = ref == null ? null : ref.get(); + if (buffer == null || buffer.capacity() < bytes.length) { + buffer = ByteBuffer.allocateDirect(bytes.length); + directBuffer.set(new SoftReference<ByteBuffer>(buffer)); + } + buffer.clear(); + buffer.put(bytes); + buffer.flip(); + return new NioByteString(buffer); + } + }; + + // 128 - [chars 0x0000 to 0x007f] + static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1; + + // 128 + static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS; + + // 1920 [chars 0x0080 to 0x07FF] + static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1; + + // 18,304 + static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT = + // Both bytes are one byte characters + (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) + + // The possible number of two byte characters + TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS; + + // 2048 + static final long THREE_BYTE_SURROGATES = 2 * 1024; + + // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates] + static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS = + 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES; + + // 2,650,112 + static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT = + // All one byte characters + (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) + + // One two byte character and a one byte character + 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + + // Three byte characters + THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS; + + // 1,048,576 [chars 0x10000L to 0x10FFFF] + static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1; + + // 289,571,839 + static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT = + // All one byte characters + (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) + + // One and three byte characters + 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + + // Two two byte characters + TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS + + // Permutations of one and two byte characters + 3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + + + // Four byte characters + FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS; + + static final class Shard { + final long index; + final long start; + final long lim; + final long expected; + + + public Shard(long index, long start, long lim, long expected) { + assertTrue(start < lim); + this.index = index; + this.start = start; + this.lim = lim; + this.expected = expected; + } + } + + static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES = + generateFourByteShardsExpectedRunnables(); + + private static long[] generateFourByteShardsExpectedRunnables() { + long[] expected = new long[128]; + + // 0-63 are all 5300224 + for (int i = 0; i <= 63; i++) { + expected[i] = 5300224; + } + + // 97-111 are all 2342912 + for (int i = 97; i <= 111; i++) { + expected[i] = 2342912; + } + + // 113-117 are all 1048576 + for (int i = 113; i <= 117; i++) { + expected[i] = 1048576; + } + + // One offs + expected[112] = 786432; + expected[118] = 786432; + expected[119] = 1048576; + expected[120] = 458752; + expected[121] = 524288; + expected[122] = 65536; + + // Anything not assigned was the default 0. + return expected; + } + + static final List<Shard> FOUR_BYTE_SHARDS = + generateFourByteShards(128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES); + + + private static List<Shard> generateFourByteShards(int numShards, long[] expected) { + assertEquals(numShards, expected.length); + List<Shard> shards = new ArrayList<Shard>(numShards); + long LIM = 1L << 32; + long increment = LIM / numShards; + assertTrue(LIM % numShards == 0); + for (int i = 0; i < numShards; i++) { + shards.add(new Shard(i, increment * i, increment * (i + 1), expected[i])); + } + return shards; + } + + /** + * Helper to run the loop to test all the permutations for the number of bytes + * specified. + * + * @param factory the factory for {@link ByteString} instances. + * @param numBytes the number of bytes in the byte array + * @param expectedCount the expected number of roundtrippable permutations + */ + static void testBytes(ByteStringFactory factory, int numBytes, long expectedCount) { + testBytes(factory, numBytes, expectedCount, 0, -1); + } + + /** + * Helper to run the loop to test all the permutations for the number of bytes + * specified. This overload is useful for debugging to get the loop to start + * at a certain character. + * + * @param factory the factory for {@link ByteString} instances. + * @param numBytes the number of bytes in the byte array + * @param expectedCount the expected number of roundtrippable permutations + * @param start the starting bytes encoded as a long as big-endian + * @param lim the limit of bytes to process encoded as a long as big-endian, + * or -1 to mean the max limit for numBytes + */ + static void testBytes( + ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) { + Random rnd = new Random(); + byte[] bytes = new byte[numBytes]; + + if (lim == -1) { + lim = 1L << (numBytes * 8); + } + long count = 0; + long countRoundTripped = 0; + for (long byteChar = start; byteChar < lim; byteChar++) { + long tmpByteChar = byteChar; + for (int i = 0; i < numBytes; i++) { + bytes[bytes.length - i - 1] = (byte) tmpByteChar; + tmpByteChar = tmpByteChar >> 8; + } + ByteString bs = factory.newByteString(bytes); + boolean isRoundTrippable = bs.isValidUtf8(); + String s = new String(bytes, Internal.UTF_8); + byte[] bytesReencoded = s.getBytes(Internal.UTF_8); + boolean bytesEqual = Arrays.equals(bytes, bytesReencoded); + + if (bytesEqual != isRoundTrippable) { + outputFailure(byteChar, bytes, bytesReencoded); + } + + // Check agreement with static Utf8 methods. + assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes)); + assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes)); + + try { + assertEquals(s, Utf8.decodeUtf8(bytes, 0, numBytes)); + } catch (InvalidProtocolBufferException e) { + if (isRoundTrippable) { + System.out.println("Could not decode utf-8"); + outputFailure(byteChar, bytes, bytesReencoded); + } + } + + // Test partial sequences. + // Partition numBytes into three segments (not necessarily non-empty). + int i = rnd.nextInt(numBytes); + int j = rnd.nextInt(numBytes); + if (j < i) { + int tmp = i; + i = j; + j = tmp; + } + int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i); + int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j); + int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes); + if (isRoundTrippable != (state3 == Utf8.COMPLETE)) { + System.out.printf("state=%04x %04x %04x i=%d j=%d%n", state1, state2, state3, i, j); + outputFailure(byteChar, bytes, bytesReencoded); + } + assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE)); + + // Test ropes built out of small partial sequences + ByteString rope = RopeByteString.newInstanceForTest( + bs.substring(0, i), + RopeByteString.newInstanceForTest(bs.substring(i, j), bs.substring(j, numBytes))); + assertSame(RopeByteString.class, rope.getClass()); + + ByteString[] byteStrings = {bs, bs.substring(0, numBytes), rope}; + for (ByteString x : byteStrings) { + assertEquals(isRoundTrippable, x.isValidUtf8()); + assertEquals(state3, x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes)); + + assertEquals(state1, x.partialIsValidUtf8(Utf8.COMPLETE, 0, i)); + assertEquals(state1, x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i)); + assertEquals(state2, x.partialIsValidUtf8(state1, i, j - i)); + assertEquals(state2, x.substring(i, j).partialIsValidUtf8(state1, 0, j - i)); + assertEquals(state3, x.partialIsValidUtf8(state2, j, numBytes - j)); + assertEquals(state3, x.substring(j, numBytes).partialIsValidUtf8(state2, 0, numBytes - j)); + } + + // ByteString reduplication should not affect its UTF-8 validity. + ByteString ropeADope = RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes)); + assertEquals(isRoundTrippable, ropeADope.isValidUtf8()); + + if (isRoundTrippable) { + countRoundTripped++; + } + count++; + if (byteChar != 0 && byteChar % 1000000L == 0) { + logger.info("Processed " + (byteChar / 1000000L) + " million characters"); + } + } + logger.info("Round tripped " + countRoundTripped + " of " + count); + assertEquals(expectedCount, countRoundTripped); + } + + /** + * Variation of {@link #testBytes} that does less allocation using the + * low-level encoders/decoders directly. Checked in because it's useful for + * debugging when trying to process bytes faster, but since it doesn't use the + * actual String class, it's possible for incompatibilities to develop + * (although unlikely). + * + * @param factory the factory for {@link ByteString} instances. + * @param numBytes the number of bytes in the byte array + * @param expectedCount the expected number of roundtrippable permutations + * @param start the starting bytes encoded as a long as big-endian + * @param lim the limit of bytes to process encoded as a long as big-endian, + * or -1 to mean the max limit for numBytes + */ + static void testBytesUsingByteBuffers( + ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) { + CharsetDecoder decoder = + Internal.UTF_8.newDecoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + CharsetEncoder encoder = + Internal.UTF_8.newEncoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + byte[] bytes = new byte[numBytes]; + int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1; + char[] charsDecoded = new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1]; + int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1; + byte[] bytesReencoded = new byte[maxBytes]; + + ByteBuffer bb = ByteBuffer.wrap(bytes); + CharBuffer cb = CharBuffer.wrap(charsDecoded); + ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded); + if (lim == -1) { + lim = 1L << (numBytes * 8); + } + long count = 0; + long countRoundTripped = 0; + for (long byteChar = start; byteChar < lim; byteChar++) { + bb.rewind(); + bb.limit(bytes.length); + cb.rewind(); + cb.limit(charsDecoded.length); + bbReencoded.rewind(); + bbReencoded.limit(bytesReencoded.length); + encoder.reset(); + decoder.reset(); + long tmpByteChar = byteChar; + for (int i = 0; i < bytes.length; i++) { + bytes[bytes.length - i - 1] = (byte) tmpByteChar; + tmpByteChar = tmpByteChar >> 8; + } + boolean isRoundTrippable = factory.newByteString(bytes).isValidUtf8(); + CoderResult result = decoder.decode(bb, cb, true); + assertFalse(result.isError()); + result = decoder.flush(cb); + assertFalse(result.isError()); + + int charLen = cb.position(); + cb.rewind(); + cb.limit(charLen); + result = encoder.encode(cb, bbReencoded, true); + assertFalse(result.isError()); + result = encoder.flush(bbReencoded); + assertFalse(result.isError()); + + boolean bytesEqual = true; + int bytesLen = bbReencoded.position(); + if (bytesLen != numBytes) { + bytesEqual = false; + } else { + for (int i = 0; i < numBytes; i++) { + if (bytes[i] != bytesReencoded[i]) { + bytesEqual = false; + break; + } + } + } + if (bytesEqual != isRoundTrippable) { + outputFailure(byteChar, bytes, bytesReencoded, bytesLen); + } + + count++; + if (isRoundTrippable) { + countRoundTripped++; + } + if (byteChar != 0 && byteChar % 1000000 == 0) { + logger.info("Processed " + (byteChar / 1000000) + " million characters"); + } + } + logger.info("Round tripped " + countRoundTripped + " of " + count); + assertEquals(expectedCount, countRoundTripped); + } + + private static void outputFailure(long byteChar, byte[] bytes, byte[] after) { + outputFailure(byteChar, bytes, after, after.length); + } + + private static void outputFailure(long byteChar, byte[] bytes, byte[] after, int len) { + fail("Failure: (" + Long.toHexString(byteChar) + ") " + toHexString(bytes) + " => " + + toHexString(after, len)); + } + + private static String toHexString(byte[] b) { + return toHexString(b, b.length); + } + + private static String toHexString(byte[] b, int len) { + StringBuilder s = new StringBuilder(); + s.append("\""); + for (int i = 0; i < len; i++) { + if (i > 0) { + s.append(" "); + } + s.append(String.format("%02x", b[i] & 0xFF)); + } + s.append("\""); + return s.toString(); + } +} |