ccutil/unichar.cpp - platform/external/tesseract - Git at Google

 ///////////////////////////////////////////////////////////////////////
 // File:        unichar.cpp
 // Description: Unicode character/ligature class.
 // Author:      Ray Smith
 // Created:     Wed Jun 28 17:05:01 PDT 2006
 //
 // (C) Copyright 2006, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 ///////////////////////////////////////////////////////////////////////

 #include "unichar.h"

 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF

 // Construct from a utf8 string. If len<0 then the string is null terminated.
 // If the string is too long to fit in the UNICHAR then it takes only what
 // will fit. Checks for illegal input and stops at an illegal sequence: an
 // illegal lead byte, a byte that is not a continuation byte where one is
 // required, or a multi-byte sequence that is cut short by len.
 // The resulting UNICHAR may be empty.
 // When fewer than UNICHAR_LEN bytes are kept, the byte count is stored in
 // chars[UNICHAR_LEN - 1] and the unused bytes before it are zeroed.
 UNICHAR::UNICHAR(const char* utf8_str, int len) {
   int total_len = 0;
   int step = 0;
   if (len < 0) {
     // Test the bound before the byte so index UNICHAR_LEN is never read.
     for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
   }
   for (total_len = 0; total_len < len; total_len += step) {
     step = utf8_step(utf8_str + total_len);
     if (total_len + step > UNICHAR_LEN)
       break;  // Too long.
     if (step == 0)
       break;  // Illegal first byte.
     if (total_len + step > len)
       break;  // Sequence truncated by len: never read or copy past the input.
     int i;
     for (i = 1; i < step; ++i)
       if ((utf8_str[total_len + i] & 0xc0) != 0x80)
         break;
     if (i < step)
       break;  // Illegal continuation byte.
   }
   memcpy(chars, utf8_str, total_len);
   if (total_len < UNICHAR_LEN) {
     chars[UNICHAR_LEN - 1] = static_cast<char>(total_len);
     while (total_len < UNICHAR_LEN - 1)
       chars[total_len++] = 0;
   }
 }

 // Construct from a single UCS4 character. Illegal values (negative, or above
 // UNI_MAX_LEGAL_UTF32) are ignored, resulting in an empty UNICHAR.
 // The UTF-8 byte count is written to chars[UNICHAR_LEN - 1] before the data
 // bytes, so a 4-byte encoding overwrites it when UNICHAR_LEN is 4.
 UNICHAR::UNICHAR(int unicode) {
   const int bytemask = 0xBF;  // Clears bit 6 so the byte reads 10xxxxxx.
   const int bytemark = 0x80;  // Sets bit 7 of a continuation byte.

   // Start from an all-zero (empty) UNICHAR so no byte is left uninitialized.
   memset(chars, 0, UNICHAR_LEN);
   if (unicode < 0 || unicode > UNI_MAX_LEGAL_UTF32)
     return;  // Illegal value: stay empty.

   if (unicode < 0x80) {
     // 1 byte: 0xxxxxxx.
     chars[UNICHAR_LEN - 1] = 1;
     chars[0] = static_cast<char>(unicode);
   } else if (unicode < 0x800) {
     // 2 bytes: 110xxxxx 10xxxxxx, filled from the last byte backwards.
     chars[UNICHAR_LEN - 1] = 2;
     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
     unicode >>= 6;
     chars[0] = static_cast<char>(unicode | 0xc0);
   } else if (unicode < 0x10000) {
     // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx.
     chars[UNICHAR_LEN - 1] = 3;
     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
     unicode >>= 6;
     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
     unicode >>= 6;
     chars[0] = static_cast<char>(unicode | 0xe0);
   } else {
     // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
     chars[UNICHAR_LEN - 1] = 4;
     chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
     unicode >>= 6;
     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
     unicode >>= 6;
     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
     unicode >>= 6;
     chars[0] = static_cast<char>(unicode | 0xf0);
   }
 }

 // Decode the first UTF-8 character held in chars and return it as UCS-4.
 // The raw bytes are accumulated six bits at a time; the lead/continuation
 // marker bits that get summed in along the way are then removed by
 // subtracting a per-length constant. A lead byte that utf8_step reports as
 // length 0 yields 0.
 int UNICHAR::first_uni() const {
   // Indexed by sequence length: the combined value of the marker bits
   // (e.g. for 2 bytes: (0xC0 << 6) + 0x80 == 0x3080).
   static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};

   const int num_bytes = utf8_step(chars);
   int code = 0;
   for (int i = 0; i < num_bytes; ++i) {
     code = (code << 6) + static_cast<unsigned char>(chars[i]);
   }
   return code - utf8_offsets[num_bytes];
 }

 // Return a newly allocated, null-terminated copy of the UTF-8 bytes, whose
 // count is given by utf8_len(). The caller owns the buffer and must
 // delete[] it after use.
 char* UNICHAR::utf8_str() const {
   const int num_bytes = utf8_len();
   char* result = new char[num_bytes + 1];
   result[num_bytes] = 0;
   memcpy(result, chars, num_bytes);
   return result;
 }

 // Get the number of bytes in the first character of the given utf8 string,
 // judged from its first byte alone. Returns 0 when that byte cannot start a
 // character (a continuation byte 0x80-0xBF, or 0xF8-0xFF).
 int UNICHAR::utf8_step(const char* utf8_str) {
   const unsigned char lead = static_cast<unsigned char>(*utf8_str);
   if (lead < 0x80) return 1;  // 0xxxxxxx: single byte.
   if (lead < 0xC0) return 0;  // 10xxxxxx: continuation byte, not a lead.
   if (lead < 0xE0) return 2;  // 110xxxxx
   if (lead < 0xF0) return 3;  // 1110xxxx
   if (lead < 0xF8) return 4;  // 11110xxx
   return 0;                   // 11111xxx: never a lead byte.
 }
	///////////////////////////////////////////////////////////////////////
	// File: unichar.cpp
	// Description: Unicode character/ligature class.
	// Author: Ray Smith
	// Created: Wed Jun 28 17:05:01 PDT 2006
	//
	// (C) Copyright 2006, Google Inc.
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	// http://www.apache.org/licenses/LICENSE-2.0
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	///////////////////////////////////////////////////////////////////////

	#include "unichar.h"

	#define UNI_MAX_LEGAL_UTF32 0x0010FFFF

	// Construct from a utf8 string. If len<0 then the string is null terminated.
	// If the string is too long to fit in the UNICHAR then it takes only what
	// will fit. Checks for illegal input and stops at an illegal sequence: an
	// illegal lead byte, a byte that is not a continuation byte where one is
	// required, or a multi-byte sequence that is cut short by len.
	// The resulting UNICHAR may be empty.
	// When fewer than UNICHAR_LEN bytes are kept, the byte count is stored in
	// chars[UNICHAR_LEN - 1] and the unused bytes before it are zeroed.
	UNICHAR::UNICHAR(const char* utf8_str, int len) {
	  int total_len = 0;
	  int step = 0;
	  if (len < 0) {
	    // Test the bound before the byte so index UNICHAR_LEN is never read.
	    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
	  }
	  for (total_len = 0; total_len < len; total_len += step) {
	    step = utf8_step(utf8_str + total_len);
	    if (total_len + step > UNICHAR_LEN)
	      break;  // Too long.
	    if (step == 0)
	      break;  // Illegal first byte.
	    if (total_len + step > len)
	      break;  // Sequence truncated by len: never read or copy past the input.
	    int i;
	    for (i = 1; i < step; ++i)
	      if ((utf8_str[total_len + i] & 0xc0) != 0x80)
	        break;
	    if (i < step)
	      break;  // Illegal continuation byte.
	  }
	  memcpy(chars, utf8_str, total_len);
	  if (total_len < UNICHAR_LEN) {
	    chars[UNICHAR_LEN - 1] = static_cast<char>(total_len);
	    while (total_len < UNICHAR_LEN - 1)
	      chars[total_len++] = 0;
	  }
	}

	// Construct from a single UCS4 character. Illegal values (negative, or above
	// UNI_MAX_LEGAL_UTF32) are ignored, resulting in an empty UNICHAR.
	// The UTF-8 byte count is written to chars[UNICHAR_LEN - 1] before the data
	// bytes, so a 4-byte encoding overwrites it when UNICHAR_LEN is 4.
	UNICHAR::UNICHAR(int unicode) {
	  const int bytemask = 0xBF;  // Clears bit 6 so the byte reads 10xxxxxx.
	  const int bytemark = 0x80;  // Sets bit 7 of a continuation byte.

	  // Start from an all-zero (empty) UNICHAR so no byte is left uninitialized.
	  memset(chars, 0, UNICHAR_LEN);
	  if (unicode < 0 || unicode > UNI_MAX_LEGAL_UTF32)
	    return;  // Illegal value: stay empty.

	  if (unicode < 0x80) {
	    // 1 byte: 0xxxxxxx.
	    chars[UNICHAR_LEN - 1] = 1;
	    chars[0] = static_cast<char>(unicode);
	  } else if (unicode < 0x800) {
	    // 2 bytes: 110xxxxx 10xxxxxx, filled from the last byte backwards.
	    chars[UNICHAR_LEN - 1] = 2;
	    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
	    unicode >>= 6;
	    chars[0] = static_cast<char>(unicode | 0xc0);
	  } else if (unicode < 0x10000) {
	    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx.
	    chars[UNICHAR_LEN - 1] = 3;
	    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
	    unicode >>= 6;
	    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
	    unicode >>= 6;
	    chars[0] = static_cast<char>(unicode | 0xe0);
	  } else {
	    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
	    chars[UNICHAR_LEN - 1] = 4;
	    chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
	    unicode >>= 6;
	    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
	    unicode >>= 6;
	    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
	    unicode >>= 6;
	    chars[0] = static_cast<char>(unicode | 0xf0);
	  }
	}

	// Decode the first UTF-8 character held in chars and return it as UCS-4.
	// The raw bytes are accumulated six bits at a time; the lead/continuation
	// marker bits that get summed in along the way are then removed by
	// subtracting a per-length constant. A lead byte that utf8_step reports as
	// length 0 yields 0.
	int UNICHAR::first_uni() const {
	  // Indexed by sequence length: the combined value of the marker bits
	  // (e.g. for 2 bytes: (0xC0 << 6) + 0x80 == 0x3080).
	  static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};

	  const int num_bytes = utf8_step(chars);
	  int code = 0;
	  for (int i = 0; i < num_bytes; ++i) {
	    code = (code << 6) + static_cast<unsigned char>(chars[i]);
	  }
	  return code - utf8_offsets[num_bytes];
	}

	// Return a newly allocated, null-terminated copy of the UTF-8 bytes, whose
	// count is given by utf8_len(). The caller owns the buffer and must
	// delete[] it after use.
	char* UNICHAR::utf8_str() const {
	  const int num_bytes = utf8_len();
	  char* result = new char[num_bytes + 1];
	  result[num_bytes] = 0;
	  memcpy(result, chars, num_bytes);
	  return result;
	}

	// Get the number of bytes in the first character of the given utf8 string,
	// judged from its first byte alone. Returns 0 when that byte cannot start a
	// character (a continuation byte 0x80-0xBF, or 0xF8-0xFF).
	int UNICHAR::utf8_step(const char* utf8_str) {
	  const unsigned char lead = static_cast<unsigned char>(*utf8_str);
	  if (lead < 0x80) return 1;  // 0xxxxxxx: single byte.
	  if (lead < 0xC0) return 0;  // 10xxxxxx: continuation byte, not a lead.
	  if (lead < 0xE0) return 2;  // 110xxxxx
	  if (lead < 0xF0) return 3;  // 1110xxxx
	  if (lead < 0xF8) return 4;  // 11110xxx
	  return 0;                   // 11111xxx: never a lead byte.
	}