diff options
Diffstat (limited to 'third_party/bson_c/src/encoding.c')
-rw-r--r-- | third_party/bson_c/src/encoding.c | 148 |
1 files changed, 148 insertions, 0 deletions
diff --git a/third_party/bson_c/src/encoding.c b/third_party/bson_c/src/encoding.c new file mode 100644 index 0000000000..8d2da1502f --- /dev/null +++ b/third_party/bson_c/src/encoding.c @@ -0,0 +1,148 @@ +/* + * Copyright 2009-2011 10gen, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Portions Copyright 2001 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + +#include "bson.h" +#include "encoding.h" + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + */ +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* --------------------------------------------------------------------- */ + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * The length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns 0. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. + */ +static int isLegalUTF8( const unsigned char *source, int length ) { + unsigned char a; + const unsigned char *srcptr = source + length; + switch ( length ) { + default: + return 0; + /* Everything else falls through when "true"... */ + case 4: + if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0; + case 3: + if ( ( a = ( *--srcptr ) ) < 0x80 || a > 0xBF ) return 0; + case 2: + if ( ( a = ( *--srcptr ) ) > 0xBF ) return 0; + switch ( *source ) { + /* no fall-through in this inner switch */ + case 0xE0: + if ( a < 0xA0 ) return 0; + break; + case 0xF0: + if ( a < 0x90 ) return 0; + break; + case 0xF4: + if ( a > 0x8F ) return 0; + break; + default: + if ( a < 0x80 ) return 0; + } + case 1: + if ( *source >= 0x80 && *source < 0xC2 ) return 0; + if ( *source > 0xF4 ) return 0; + } + return 1; +} + +static int bson_validate_string( bson *b, const unsigned char *string, + const int length, const char check_utf8, const char check_dot, + const char check_dollar ) { + + int position = 0; + int sequence_length = 1; + + if( check_dollar && string[0] == '$' ) { + b->err |= BSON_FIELD_INIT_DOLLAR; + } + + while ( position < length ) { + if ( check_dot && *( string + position ) == '.' ) { + b->err |= BSON_FIELD_HAS_DOT; + } + + if ( check_utf8 ) { + sequence_length = trailingBytesForUTF8[*( string + position )] + 1; + if ( ( position + sequence_length ) > length ) { + b->err |= BSON_NOT_UTF8; + return BSON_ERROR; + } + if ( !isLegalUTF8( string + position, sequence_length ) ) { + b->err |= BSON_NOT_UTF8; + return BSON_ERROR; + } + } + position += sequence_length; + } + + return BSON_OK; +} + + +int bson_check_string( bson *b, const char *string, + const int length ) { + + return bson_validate_string( b, ( const unsigned char * )string, length, 1, 0, 0 ); +} + +int bson_check_field_name( bson *b, const char *string, + const int length ) { + + return bson_validate_string( b, ( const unsigned char * )string, length, 1, 1, 1 ); +} |