blob: 7001024b491177c491eaa2bb40967f46d843ebea [file] [log] [blame]
/* ------------------------------------------------------------------
* Copyright (C) 1998-2009 PacketVideo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied.
* See the License for the specific language governing permissions
* and limitations under the License.
* -------------------------------------------------------------------
*/
// -*- c++ -*-
// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
// O S C L UTF-8 S T R I N G FUNCTIONS
// = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
// - - Inclusion - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#include "oscl_string_utf8.h"
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#define UTF8_1_BYTE_MASK 0x80 //1000 0000 -> 0xxx xxxx
#define UTF8_1_BYTE_SIG 0x00 //0000 0000 -> 0xxx xxxx
#define UTF8_2_BYTES_MASK_1 0xE0 //1110 0000 -> 110y yyyy
#define UTF8_2_BYTES_MASK_2 0xC0 //1100 0000 -> 10xx xxxx
#define UTF8_2_BYTES_SIG_1 0xC0 //1100 0000 -> 110y yyyy
#define UTF8_2_BYTES_SIG_2 0x80 //1000 0000 -> 10xx xxxx
#define UTF8_3_BYTES_MASK_1 0xF0 //1111 0000 -> 1110 zzzz
#define UTF8_3_BYTES_MASK_2 0xC0 //1100 0000 -> 10yy yyyy
#define UTF8_3_BYTES_MASK_3 0xC0 //1100 0000 -> 10xx xxxx
#define UTF8_3_BYTES_SIG_1 0xE0 //1110 0000 -> 1110 zzzz
#define UTF8_3_BYTES_SIG_2 0x80 //1000 0000 -> 10yy yyyy
#define UTF8_3_BYTES_SIG_3 0x80 //1000 0000 -> 10xx xxxx
#define UTF8_4_BYTES_MASK_1 0xF8 //1111 1000 -> 1111 0uuu
#define UTF8_4_BYTES_MASK_2 0xC0 //1100 0000 -> 10uu zzzz
#define UTF8_4_BYTES_MASK_3 0xC0 //1100 0000 -> 10yy yyyy
#define UTF8_4_BYTES_MASK_4 0xC0 //1100 0000 -> 10xx xxxx
#define UTF8_4_BYTES_SIG_1 0xF0 //1111 0000 -> 1111 0uuu
#define UTF8_4_BYTES_SIG_2 0x80 //1000 0000 -> 10uu zzzz
#define UTF8_4_BYTES_SIG_3 0x80 //1000 0000 -> 10yy yyyy
#define UTF8_4_BYTES_SIG_4 0x80 //1000 0000 -> 10xx xxxx
/* ========================================================================
* Function : oscl_str_is_valid_and_truncate_utf8
* Date : 10/25/2002
* Purpose :
* Modified :
* ========================================================================
*/
OSCL_EXPORT_REF bool oscl_str_is_valid_and_truncate_utf8(const uint8 *str_buf, uint32& num_valid_characters, uint32 max_bytes, uint32& truncate_pos_index,
uint32 max_char_2_valid = 0)
{
bool result = true, done = false;
uint32 bytes_left;
uint32 total_num_valid_characters;
const uint8 *ptr;
ptr = str_buf;
bytes_left = max_bytes;
total_num_valid_characters = 0;
truncate_pos_index = 0;
while (!done)
{
if ((*ptr & UTF8_1_BYTE_MASK) == UTF8_1_BYTE_SIG)
{
if ((*ptr == 0x0) && (max_bytes == 0)) //if need to be terminated at first null char
{
result = true;
done = true;
continue;
}
if (bytes_left > 0)
bytes_left--;
if (max_bytes && !bytes_left) //done when there is no more data
done = true;
ptr++;
total_num_valid_characters ++;
truncate_pos_index += 1;
}
else if ((*ptr & UTF8_2_BYTES_MASK_1) == UTF8_2_BYTES_SIG_1)
{
if ((!max_bytes) || (bytes_left >= 2))
{
if (((*(ptr + 1) & UTF8_2_BYTES_MASK_2) != UTF8_2_BYTES_SIG_2)
|| (*ptr < 0xC2) //C0, C1 are illegal
)
{ //invalid char
done = true;
result = false;
continue;
}
if (bytes_left)
bytes_left -= 2;
if (max_bytes && !bytes_left) //done when there is no more data
done = true;
ptr += 2;
total_num_valid_characters++;
truncate_pos_index += 2;
}
else
{
done = true;
result = false;
}
}
else if ((*ptr & UTF8_3_BYTES_MASK_1) == UTF8_3_BYTES_SIG_1)
{
if ((!max_bytes) || (bytes_left >= 3))
{
if (((*(ptr + 1) & UTF8_3_BYTES_MASK_2) != UTF8_3_BYTES_SIG_2)
|| ((*(ptr + 2) & UTF8_3_BYTES_MASK_3) != UTF8_3_BYTES_SIG_3)
|| ((*ptr == 0xE0) && (*(ptr + 1) < 0xA0)) //if 1st byte =E0, 2n byte has to be greater than 0xA0
)
{ //invalid char
done = true;
result = false;
continue;
}
if (bytes_left > 0)
bytes_left -= 3;
if (max_bytes && !bytes_left) //done when there is no more data
done = true;
ptr += 3;
total_num_valid_characters++;
truncate_pos_index += 3;
}
else
{
done = true;
result = false;
}
}
else if ((*ptr & UTF8_4_BYTES_MASK_1) == UTF8_4_BYTES_SIG_1)
{
if ((!max_bytes) || (bytes_left >= 4))
{
if (((*(ptr + 1) & UTF8_4_BYTES_MASK_2) != UTF8_4_BYTES_SIG_2)
|| ((*(ptr + 2) & UTF8_4_BYTES_MASK_3) != UTF8_4_BYTES_SIG_3)
|| ((*(ptr + 3) & UTF8_4_BYTES_MASK_4) != UTF8_4_BYTES_SIG_4)
|| (*ptr > 0xF4)
|| ((*ptr == 0xF4) && (*(ptr + 1) > 0x8F))
)
{ //invalid char
done = true;
result = false;
continue;
}
if (bytes_left > 0)
bytes_left -= 4;
if (max_bytes && !bytes_left) //done when there is no more data
done = true;
ptr += 4;
total_num_valid_characters++;
truncate_pos_index += 4;
}
else
{
done = true;
result = false;
}
}
else //found invalid char
{
done = true;
result = false;
if (bytes_left > 0)
bytes_left--;
}
if (num_valid_characters && (total_num_valid_characters == num_valid_characters))
done = true;
if ((!num_valid_characters) && max_char_2_valid && (total_num_valid_characters == max_char_2_valid))
done = true;
}
num_valid_characters = total_num_valid_characters;
return result;
}
/* ========================================================================
* Function : oscl_str_is_valid_utf8
* Date : 10/25/2002
* Purpose : see oscl_string_utf8.h
* Modified :
* ========================================================================
*/
OSCL_EXPORT_REF bool oscl_str_is_valid_utf8(const uint8 *str_buf, uint32& num_valid_characters, uint32 max_bytes,
uint32 max_char_2_valid, uint32 *num_byte_4_char)
{
uint32 truncate_pos_index = 0;
num_valid_characters = 0;
bool result;
if (num_byte_4_char)
*num_byte_4_char = 0;
result = oscl_str_is_valid_and_truncate_utf8(str_buf, num_valid_characters, max_bytes, truncate_pos_index, max_char_2_valid);
if (num_byte_4_char)
*num_byte_4_char = truncate_pos_index;
return result;
}
/* ========================================================================
* Function : oscl_str_truncate_utf8
* Date : 10/25/2002
* Purpose : see oscl_string_utf8.h
* Modified :
* ========================================================================
*/
OSCL_EXPORT_REF int32 oscl_str_truncate_utf8(uint8 *str_buf, uint32 max_chars, uint32 max_bytes)
{
uint32 num_valid_characters, truncate_pos_index;
num_valid_characters = max_chars;
truncate_pos_index = 0;
oscl_str_is_valid_and_truncate_utf8(str_buf, num_valid_characters, max_bytes, truncate_pos_index);
if (max_bytes)
{
if (truncate_pos_index <= (max_bytes - 1))
*(str_buf + truncate_pos_index) = 0x00;
}
else
{
*(str_buf + truncate_pos_index) = 0x00;
}
return num_valid_characters;
}