1 files changed, 148 insertions, 0 deletions
diff --git a/third_party/bson_c/src/encoding.c b/third_party/bson_c/src/encoding.c
new file mode 100644
index 0000000000..8d2da1502f
--- /dev/null
+++ b/third_party/bson_c/src/encoding.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2009-2011 10gen, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright 2001 Unicode, Inc.
+ *
+ * Disclaimer
+ *
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ *
+ * Limitations on Rights to Redistribute This Code
+ *
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+
+#include "bson.h"
+#include "encoding.h"
+
+/*
+ * Indexed by the first byte of a UTF-8 sequence: the number of trailing
+ * bytes that byte announces. 0x80-0xBF map to 0; 0xF8-0xFF map to 4 or 5.
+ */
+static const char trailingBytesForUTF8[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F */
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF */
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  /* 0xE0-0xFF */
+};
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Utility routine to tell whether a sequence of bytes is legal UTF-8.
+ * This must be called with the length pre-determined by the first byte:
+ *  length = trailingBytesForUTF8[*source]+1;
+ * and the caller must already have verified that many bytes are available.
+ * Returns 1 if the sequence is legal, 0 otherwise.  Rejected: lengths
+ * outside 1..4 (Unicode UTF-8 stops at 4 bytes), bad continuation bytes,
+ * overlong forms, code points above U+10FFFF, and the UTF-16 surrogates
+ * U+D800..U+DFFF (0xED followed by 0xA0..0xBF), per RFC 3629.
+ */
+static int isLegalUTF8( const unsigned char *source, int length ) {
+    unsigned char a;
+    const unsigned char *srcptr = source + length;
+    switch ( length ) {
+    default:
+        return 0;
+        /* Everything else falls through when "true"... */
+    case 4:
+        if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0;
+        /* fall through */
+    case 3:
+        if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0;
+        /* fall through */
+    case 2:
+        /* Load the second byte; its legal range depends on the lead byte. */
+        if ( ( a = ( *--srcptr ) ) > 0xBF ) return 0;
+        switch ( *source ) {
+            /* no fall-through in this inner switch */
+        case 0xE0: if ( a < 0xA0 ) return 0; break; /* overlong 3-byte form */
+        case 0xED: if ( a > 0x9F ) return 0; break; /* U+D800..U+DFFF */
+        case 0xF0: if ( a < 0x90 ) return 0; break; /* overlong 4-byte form */
+        case 0xF4: if ( a > 0x8F ) return 0; break; /* above U+10FFFF */
+        default:   if ( a < 0x80 ) return 0;        /* not a continuation */
+        }
+        /* fall through */
+    case 1:
+        /* 0x80..0xC1: stray continuation byte or overlong 2-byte lead. */
+        if ( *source >= 0x80 && *source < 0xC2 ) return 0;
+        /* Lead bytes above 0xF4 would encode beyond U+10FFFF. */
+        if ( *source > 0xF4 ) return 0;
+    }
+    return 1;
+}
+
+/*
+ * Scan `length` bytes of `string`, recording any problems in b->err.
+ * check_dollar: flag BSON_FIELD_INIT_DOLLAR when the first byte is '$'.
+ * check_dot:    flag BSON_FIELD_HAS_DOT when a '.' is encountered.
+ * check_utf8:   on the first bad sequence flag BSON_NOT_UTF8, return BSON_ERROR.
+ * Returns BSON_OK otherwise ('.' and '$' findings only set bits in b->err).
+ */
+static int bson_validate_string( bson *b, const unsigned char *string,
+                                 const int length, const char check_utf8, const char check_dot,
+                                 const char check_dollar ) {
+    int position = 0;
+    int sequence_length = 1;
+    /* Only inspect string[0] when the buffer actually holds a byte. */
+    if( check_dollar && length > 0 && string[0] == '$' ) {
+        b->err |= BSON_FIELD_INIT_DOLLAR;
+    }
+    while ( position < length ) {
+        if ( check_dot && string[position] == '.' ) {
+            b->err |= BSON_FIELD_HAS_DOT;
+        }
+        if ( check_utf8 ) {
+            sequence_length = trailingBytesForUTF8[string[position]] + 1;
+            if ( ( position + sequence_length ) > length ||
+                    !isLegalUTF8( string + position, sequence_length ) ) {
+                b->err |= BSON_NOT_UTF8;
+                return BSON_ERROR;
+            }
+        }
+        position += sequence_length;
+    }
+    return BSON_OK;
+}
+
+
+/* UTF-8 check only: runs bson_validate_string() with dot/dollar checks off. */
+int bson_check_string( bson *b, const char *string, const int length ) {
+    const unsigned char *bytes = ( const unsigned char * )string;
+    return bson_validate_string( b, bytes, length, 1, 0, 0 );
+}
+
+/* Field-name check: UTF-8 validation plus the '.' and leading-'$' checks. */
+int bson_check_field_name( bson *b, const char *string, const int length ) {
+    const unsigned char *name = ( const unsigned char * )string;
+    return bson_validate_string( b, name, length, 1, 1, 1 );
+}