libdex/DexUtf.cpp - platform/dalvik - Git at Google

 /*
  * Copyright (C) 2011 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /*
  * Validate and manipulate MUTF-8 encoded string data.
  */

 #include "DexUtf.h"

 /*
  * Order two '\0'-terminated modified UTF-8 strings by the values that
  * dexGetUtf16FromUtf8() decodes from them, taking one value from each
  * string per step. Different encodings of the same value therefore
  * compare equal; only a literal '\0' byte ends a string.
  *
  * Returns 0 if both strings end together, -1 if s1 ends first, 1 if
  * s2 ends first, and otherwise the (non-zero) difference between the
  * first pair of decoded values that disagree, i.e. strcmp()-style.
  */
 int dexUtf8Cmp(const char* s1, const char* s2) {
     // Walk both strings in lockstep while neither has hit its terminator.
     while ((*s1 != '\0') && (*s2 != '\0')) {
         int lhs = dexGetUtf16FromUtf8(&s1);
         int rhs = dexGetUtf16FromUtf8(&s2);

         if (lhs != rhs) {
             return lhs - rhs;
         }
     }

     // At least one string is exhausted; the one that ended sorts first.
     if (*s1 != '\0') {
         return 1;
     }
     return (*s2 == '\0') ? 0 : -1;
 }

 /*
  * For dexIsValidMemberNameUtf8(): a bit vector indicating which low-ASCII
  * characters (0x00..0x7f) are valid. Each 32-bit word covers 32
  * consecutive character values, as noted per entry. The masks are
  * consistent with character c being represented by bit (c & 0x1f) of
  * word (c >> 5); e.g. '$' (0x24) is bit 4 of word 1 and '_' (0x5f) is
  * bit 31 of word 2.
  * NOTE(review): the lookup itself is not in this file -- confirm the
  * indexing against the code that reads this table.
  */
 u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
     0x00000000, // 00..1f low control characters; nothing valid
     0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-'
     0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
     0x07fffffe  // 60..7f lowercase etc.; valid: 'a'..'z'
 };

 /* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
 bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
     /*
      * It's a multibyte encoded character. Decode it and analyze. We
      * accept anything that isn't (a) an improperly encoded low value,
      * (b) an improper surrogate pair, (c) an encoded '\0', (d) a high
      * control character, or (e) a high space, layout, or special
      * character (U+00a0, U+2000..U+200f, U+2028..U+202f,
      * U+fff0..U+ffff). This is all specified in the dex format
      * document.
      */

     u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);

     // Perform follow-up tests based on the high 8 bits.
     switch (utf16 >> 8) {
         case 0x00: {
             // It's only valid if it's above the ISO-8859-1 high space (0xa0).
             return (utf16 > 0x00a0);
         }
         case 0xd8:
         case 0xd9:
         case 0xda:
         case 0xdb: {
             /*
              * It's a leading surrogate. Check to see that a trailing
              * surrogate follows.
              */
             utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
             return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
         }
         case 0xdc:
         case 0xdd:
         case 0xde:
         case 0xdf: {
             // It's a trailing surrogate, which is not valid at this point.
             return false;
         }
         case 0x20:
         case 0xff: {
             // It's in the range that has spaces, controls, and specials.
             switch (utf16 & 0xfff8) {
                 case 0x2000:
                 case 0x2008:
                 case 0x2028:
                 case 0xfff0:
                 case 0xfff8: {
                     return false;
                 }
             }
             break;
         }
     }

     return true;
 }

 /*
  * Return whether the given string is a valid field or method name.
  *
  * The empty string is rejected. A name may start with '<', in which
  * case it must end with '>' as its final character; '<' is not
  * accepted anywhere else. Every other character is checked with
  * dexIsValidMemberNameUtf8().
  */
 bool dexIsValidMemberName(const char* s) {
     if (*s == '\0') {
         // The empty string is not a valid name.
         return false;
     }

     // A leading '<' obliges the name to finish with '>'.
     const bool bracketed = (*s == '<');
     if (bracketed) {
         s++;
     }

     while (*s != '\0') {
         if (*s == '>') {
             // '>' is only legal as the last character of a '<' name.
             return bracketed && (s[1] == '\0');
         }
         if (!dexIsValidMemberNameUtf8(&s)) {
             return false;
         }
     }

     // Reached the terminator without seeing '>'.
     return !bracketed;
 }

 /* Helper for validating type descriptors and class names, which is parametric
  * with respect to type vs. class and dot vs. slash. */
 static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName,
         bool dotSeparator) {
     int arrayCount = 0;

     while (*s == '[') {
         arrayCount++;
         s++;
     }

     if (arrayCount > 255) {
         // Arrays may have no more than 255 dimensions.
         return false;
     }

     if (arrayCount != 0) {
         /*
          * If we're looking at an array of some sort, then it doesn't
          * matter if what is being asked for is a class name; the
          * format looks the same as a type descriptor in that case, so
          * treat it as such.
          */
         isClassName = false;
     }

     if (!isClassName) {
         /*
          * We are looking for a descriptor. Either validate it as a
          * single-character primitive type, or continue on to check the
          * embedded class name (bracketed by "L" and ";").
          */
         switch (*(s++)) {
             case 'B':
             case 'C':
             case 'D':
             case 'F':
             case 'I':
             case 'J':
             case 'S':
             case 'Z': {
                 // These are all single-character descriptors for primitive types.
                 return (*s == '\0');
             }
             case 'V': {
                 // Non-array void is valid, but you can't have an array of void.
                 return (arrayCount == 0) && (*s == '\0');
             }
             case 'L': {
                 // Class name: Break out and continue below.
                 break;
             }
             default: {
                 // Oddball descriptor character.
                 return false;
             }
         }
     }

     /*
      * We just consumed the 'L' that introduces a class name as part
      * of a type descriptor, or we are looking for an unadorned class
      * name.
      */

     bool sepOrFirst = true; // first character or just encountered a separator.
     for (;;) {
         u1 c = (u1) *s;
         switch (c) {
             case '\0': {
                 /*
                  * Premature end for a type descriptor, but valid for
                  * a class name as long as we haven't encountered an
                  * empty component (including the degenerate case of
                  * the empty string "").
                  */
                 return isClassName && !sepOrFirst;
             }
             case ';': {
                 /*
                  * Invalid character for a class name, but the
                  * legitimate end of a type descriptor. In the latter
                  * case, make sure that this is the end of the string
                  * and that it doesn't end with an empty component
                  * (including the degenerate case of "L;").
                  */
                 return !isClassName && !sepOrFirst && (s[1] == '\0');
             }
             case '/':
             case '.': {
                 if (dotSeparator != (c == '.')) {
                     // The wrong separator character.
                     return false;
                 }
                 if (sepOrFirst) {
                     // Separator at start or two separators in a row.
                     return false;
                 }
                 sepOrFirst = true;
                 s++;
                 break;
             }
             default: {
                 if (!dexIsValidMemberNameUtf8(&s)) {
                     return false;
                 }
                 sepOrFirst = false;
                 break;
             }
         }
     }
 }

 /*
  * Return whether the given string is a valid type descriptor. This is
  * the shared validator run in descriptor mode; the separator flag is
  * passed as false, i.e. '/' separates components.
  */
 bool dexIsValidTypeDescriptor(const char* s) {
     const bool asClassName = false;
     const bool useDots = false;

     return isValidTypeDescriptorOrClassName(s, asClassName, useDots);
 }

 /*
  * (documented in header)
  *
  * Runs the shared validator in class-name mode, forwarding the caller's
  * choice of '.' or '/' as the component separator.
  */
 bool dexIsValidClassName(const char* s, bool dotSeparator) {
     const bool asClassName = true;

     return isValidTypeDescriptorOrClassName(s, asClassName, dotSeparator);
 }

 /*
  * Return whether the given string is a valid reference descriptor:
  * dexIsValidTypeDescriptor() must accept it, and it must describe a
  * class ('L' prefix) or an array ('[' prefix) rather than a primitive
  * type.
  */
 bool dexIsReferenceDescriptor(const char* s) {
     // Validate first; the prefix test only matters for valid descriptors.
     return dexIsValidTypeDescriptor(s) && ((s[0] == 'L') || (s[0] == '['));
 }

 /*
  * Return whether the given string is a valid class descriptor:
  * dexIsValidTypeDescriptor() must accept it, and it must start with
  * 'L', which excludes arrays and primitive types.
  */
 bool dexIsClassDescriptor(const char* s) {
     // Validate first; the prefix test only matters for valid descriptors.
     return dexIsValidTypeDescriptor(s) && (s[0] == 'L');
 }

 /*
  * Return whether the given string is a valid field type descriptor:
  * dexIsValidTypeDescriptor() must accept it, and it must not start
  * with 'V' (i.e. it is anything but "void").
  */
 bool dexIsFieldDescriptor(const char* s) {
     // Validate first; the prefix test only matters for valid descriptors.
     return dexIsValidTypeDescriptor(s) && (s[0] != 'V');
 }
	/*
	* Copyright (C) 2011 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/*
	* Validate and manipulate MUTF-8 encoded string data.
	*/

	#include "DexUtf.h"

	/*
	 * Compare two '\0'-terminated modified UTF-8 strings by the values
	 * that dexGetUtf16FromUtf8() decodes from them. Different encodings
	 * of the same value compare equal, except that only a real '\0' byte
	 * is treated as the string terminator.
	 *
	 * The result follows strcmp(): 0 when both strings end together, -1
	 * when s1 ends first, 1 when s2 ends first, and otherwise the
	 * difference between the first pair of decoded values that differ.
	 */
	int dexUtf8Cmp(const char* s1, const char* s2) {
	    for (;;) {
	        const bool done1 = (*s1 == '\0');
	        const bool done2 = (*s2 == '\0');

	        if (done1 || done2) {
	            // both done -> 0, only s1 done -> -1, only s2 done -> 1
	            return (int) done2 - (int) done1;
	        }

	        int value1 = dexGetUtf16FromUtf8(&s1);
	        int value2 = dexGetUtf16FromUtf8(&s2);

	        if (value1 != value2) {
	            return value1 - value2;
	        }
	    }
	}

	/*
	 * For dexIsValidMemberNameUtf8(): a bit vector indicating which
	 * low-ASCII characters (0x00..0x7f) are valid. Each 32-bit word covers
	 * 32 consecutive character values, as noted per entry. The masks are
	 * consistent with character c being represented by bit (c & 0x1f) of
	 * word (c >> 5); e.g. '-' (0x2d) is bit 13 of word 1.
	 * NOTE(review): the lookup itself is not in this file -- confirm the
	 * indexing against the code that reads this table.
	 */
	u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
	0x00000000, // 00..1f low control characters; nothing valid
	0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-'
	0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
	0x07fffffe // 60..7f lowercase etc.; valid: 'a'..'z'
	};

	/* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
	bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
	/*
	* It's a multibyte encoded character. Decode it and analyze. We
	* accept anything that isn't (a) an improperly encoded low value,
	* (b) an improper surrogate pair, (c) an encoded '\0', (d) a high
	* control character, or (e) a high space, layout, or special
	* character (U+00a0, U+2000..U+200f, U+2028..U+202f,
	* U+fff0..U+ffff). This is all specified in the dex format
	* document.
	*/

	u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);

	// Perform follow-up tests based on the high 8 bits.
	switch (utf16 >> 8) {
	case 0x00: {
	// It's only valid if it's above the ISO-8859-1 high space (0xa0).
	return (utf16 > 0x00a0);
	}
	case 0xd8:
	case 0xd9:
	case 0xda:
	case 0xdb: {
	/*
	* It's a leading surrogate. Check to see that a trailing
	* surrogate follows.
	*/
	utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
	return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
	}
	case 0xdc:
	case 0xdd:
	case 0xde:
	case 0xdf: {
	// It's a trailing surrogate, which is not valid at this point.
	return false;
	}
	case 0x20:
	case 0xff: {
	// It's in the range that has spaces, controls, and specials.
	switch (utf16 & 0xfff8) {
	case 0x2000:
	case 0x2008:
	case 0x2028:
	case 0xfff0:
	case 0xfff8: {
	return false;
	}
	}
	break;
	}
	}

	return true;
	}

	/*
	 * Return whether the given string is a valid field or method name.
	 *
	 * An empty string is never valid. '<' is allowed only as the first
	 * character, and a name that starts with it must end with '>'. All
	 * other characters must pass dexIsValidMemberNameUtf8().
	 */
	bool dexIsValidMemberName(const char* s) {
	    bool mustCloseAngle = false;

	    if (s[0] == '\0') {
	        // The empty string is not a valid name.
	        return false;
	    }
	    if (s[0] == '<') {
	        mustCloseAngle = true;
	        ++s;
	    }

	    do {
	        if (*s == '\0') {
	            // End of string: fine unless a closing '>' is still owed.
	            return !mustCloseAngle;
	        }
	        if (*s == '>') {
	            // Only valid as the final character of a '<' name.
	            return mustCloseAngle && (s[1] == '\0');
	        }
	    } while (dexIsValidMemberNameUtf8(&s));

	    // The loop only falls through when a character was rejected.
	    return false;
	}

	/* Helper for validating type descriptors and class names, which is parametric
	* with respect to type vs. class and dot vs. slash. */
	static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName,
	bool dotSeparator) {
	int arrayCount = 0;

	while (*s == '[') {
	arrayCount++;
	s++;
	}

	if (arrayCount > 255) {
	// Arrays may have no more than 255 dimensions.
	return false;
	}

	if (arrayCount != 0) {
	/*
	* If we're looking at an array of some sort, then it doesn't
	* matter if what is being asked for is a class name; the
	* format looks the same as a type descriptor in that case, so
	* treat it as such.
	*/
	isClassName = false;
	}

	if (!isClassName) {
	/*
	* We are looking for a descriptor. Either validate it as a
	* single-character primitive type, or continue on to check the
	* embedded class name (bracketed by "L" and ";").
	*/
	switch (*(s++)) {
	case 'B':
	case 'C':
	case 'D':
	case 'F':
	case 'I':
	case 'J':
	case 'S':
	case 'Z': {
	// These are all single-character descriptors for primitive types.
	return (*s == '\0');
	}
	case 'V': {
	// Non-array void is valid, but you can't have an array of void.
	return (arrayCount == 0) && (*s == '\0');
	}
	case 'L': {
	// Class name: Break out and continue below.
	break;
	}
	default: {
	// Oddball descriptor character.
	return false;
	}
	}
	}

	/*
	* We just consumed the 'L' that introduces a class name as part
	* of a type descriptor, or we are looking for an unadorned class
	* name.
	*/

	bool sepOrFirst = true; // first character or just encountered a separator.
	for (;;) {
	u1 c = (u1) *s;
	switch (c) {
	case '\0': {
	/*
	* Premature end for a type descriptor, but valid for
	* a class name as long as we haven't encountered an
	* empty component (including the degenerate case of
	* the empty string "").
	*/
	return isClassName && !sepOrFirst;
	}
	case ';': {
	/*
	* Invalid character for a class name, but the
	* legitimate end of a type descriptor. In the latter
	* case, make sure that this is the end of the string
	* and that it doesn't end with an empty component
	* (including the degenerate case of "L;").
	*/
	return !isClassName && !sepOrFirst && (s[1] == '\0');
	}
	case '/':
	case '.': {
	if (dotSeparator != (c == '.')) {
	// The wrong separator character.
	return false;
	}
	if (sepOrFirst) {
	// Separator at start or two separators in a row.
	return false;
	}
	sepOrFirst = true;
	s++;
	break;
	}
	default: {
	if (!dexIsValidMemberNameUtf8(&s)) {
	return false;
	}
	sepOrFirst = false;
	break;
	}
	}
	}
	}

	/*
	 * Return whether the given string is a valid type descriptor, by
	 * running the shared validator in descriptor mode with '/' as the
	 * component separator.
	 */
	bool dexIsValidTypeDescriptor(const char* s) {
	    const bool classNameMode = false;
	    const bool dotSeparated = false;

	    return isValidTypeDescriptorOrClassName(s, classNameMode, dotSeparated);
	}

	/*
	 * (documented in header)
	 *
	 * Delegates to the shared validator in class-name mode, passing the
	 * caller's separator choice through unchanged.
	 */
	bool dexIsValidClassName(const char* s, bool dotSeparator) {
	    const bool classNameMode = true;

	    return isValidTypeDescriptorOrClassName(s, classNameMode, dotSeparator);
	}

	/*
	 * Return whether the given string is a valid reference descriptor.
	 * This is true if dexIsValidTypeDescriptor() returns true and the
	 * descriptor is for a class ('L' prefix) or an array ('[' prefix)
	 * and not a primitive type.
	 */
	bool dexIsReferenceDescriptor(const char* s) {
	    if (!dexIsValidTypeDescriptor(s)) {
	        return false;
	    }

	    // The operator here had been corrupted into the escaped sequence
	    // "\|\|", which does not compile; it is a plain logical OR.
	    return (s[0] == 'L') || (s[0] == '[');
	}

	/*
	 * Return whether the given string is a valid class descriptor. The
	 * string must pass dexIsValidTypeDescriptor() and begin with 'L', so
	 * array and primitive descriptors are rejected.
	 */
	bool dexIsClassDescriptor(const char* s) {
	    const bool wellFormed = dexIsValidTypeDescriptor(s);

	    return wellFormed && (s[0] == 'L');
	}

	/*
	 * Return whether the given string is a valid field type descriptor.
	 * The string must pass dexIsValidTypeDescriptor() and must not begin
	 * with 'V', i.e. anything but "void" is accepted.
	 */
	bool dexIsFieldDescriptor(const char* s) {
	    const bool wellFormed = dexIsValidTypeDescriptor(s);

	    return wellFormed && (s[0] != 'V');
	}