// Source: Windows-4.7.4/src/3rdparty/clucene/src/CLucene/config/utf8.cpp - platform/external/qt - Git at Google

 /*
  * Copyright (C) 1999 Tom Tromey
  * Copyright (C) 2000 Red Hat, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 02111-1307, USA.
  *
  *
  ************************************************
  * Also licensed with permission from Tom Tromey
  * and Owen Taylor under the Apache license.
  * Original location:
  * http://cvs.gnome.org/viewcvs/glib/glib/gutf8.c?rev=1.50&view=log
  ************************************************
  *
  * Copyright 2003-2006 The Apache Software Foundation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
  #include "CLucene/StdHeader.h"

 /* GLib-style aliases retained from the gutf8.c code this file derives from. */
 typedef unsigned long  gunichar;  /* holds a character code while it is being encoded */
 typedef unsigned char  guchar;    /* small unsigned value; used below for a byte count */

 /*
  * UTF8_COMPUTE: classify the lead byte of a UTF-8 sequence.
  *
  * Char - value of the first byte (pass it as an unsigned value);
  *        evaluated several times, so it must have no side effects.
  * Mask - lvalue that receives the bit mask selecting the payload bits
  *        of the lead byte; left untouched when the byte is invalid.
  * Len  - signed lvalue that receives the sequence length (1-6), or -1
  *        when Char cannot start a sequence.
  *
  * Every argument is parenthesized and the body is wrapped in
  * do/while (0) so the macro behaves as a single statement (safe in an
  * unbraced if/else); invoke it with a trailing semicolon.
  */
 #define UTF8_COMPUTE(Char, Mask, Len) \
   do \
     { \
       if ((Char) < 128) \
         { \
           (Len) = 1; \
           (Mask) = 0x7f; \
         } \
       else if (((Char) & 0xe0) == 0xc0) \
         { \
           (Len) = 2; \
           (Mask) = 0x1f; \
         } \
       else if (((Char) & 0xf0) == 0xe0) \
         { \
           (Len) = 3; \
           (Mask) = 0x0f; \
         } \
       else if (((Char) & 0xf8) == 0xf0) \
         { \
           (Len) = 4; \
           (Mask) = 0x07; \
         } \
       else if (((Char) & 0xfc) == 0xf8) \
         { \
           (Len) = 5; \
           (Mask) = 0x03; \
         } \
       else if (((Char) & 0xfe) == 0xfc) \
         { \
           (Len) = 6; \
           (Mask) = 0x01; \
         } \
       else \
         (Len) = -1; \
     } \
   while (0)

 /*#define UTF8_LENGTH(Char)              \
   ((Char) < 0x80 ? 1 :                 \
    ((Char) < 0x800 ? 2 :               \
     ((Char) < 0x10000 ? 3 :            \
      ((Char) < 0x200000 ? 4 :          \
       ((Char) < 0x4000000 ? 5 : 6)))))*/


 /*
  * UTF8_GET: decode the UTF-8 sequence starting at Chars[0].
  *
  * Result - integer lvalue that receives the decoded value, or -1 as soon
  *          as a byte that is not a continuation byte (10xxxxxx) is met.
  * Chars  - indexable byte sequence; Chars[0] .. Chars[Len-1] may be read.
  * Count  - integer lvalue used as the loop index; after a successful
  *          decode it equals Len, after a failure it is the index of the
  *          offending byte.
  * Mask   - payload mask for the lead byte (see UTF8_COMPUTE).
  * Len    - sequence length in bytes.
  *
  * Wrapped in do/while (0) so both the initial assignment and the loop
  * are governed by an enclosing unbraced if/else; invoke it with a
  * trailing semicolon.  The break only leaves the inner for loop.
  */
 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
   do \
     { \
       (Result) = (Chars)[0] & (Mask); \
       for ((Count) = 1; (Count) < (Len); ++(Count)) \
         { \
           if (((Chars)[(Count)] & 0xc0) != 0x80) \
             { \
               (Result) = -1; \
               break; \
             } \
           (Result) <<= 6; \
           (Result) |= ((Chars)[(Count)] & 0x3f); \
         } \
     } \
   while (0)


 /**
  * lucene_wctoutf8:
  * @outbuf: destination buffer with room for at least 6 bytes, or NULL
  *          to only compute the encoded length without writing anything.
  * @ch: the character code to encode.
  *
  * Encodes a single character as UTF-8 using the 1-6 byte scheme (codes
  * of 0x200000 and above produce 5 or 6 bytes).  No terminating NUL is
  * written.
  *
  * Return value: number of bytes the encoding occupies (1 to 6)
  **/
 size_t	lucene_wctoutf8(char * outbuf, const wchar_t ch)
 {
   gunichar code = ch;
   guchar nbytes;
   int lead;

   /* Pick the sequence length and the marker bits of the lead byte. */
   if (code >= 0x4000000)
     {
       lead = 0xfc;
       nbytes = 6;
     }
   else if (code >= 0x200000)
     {
       lead = 0xf8;
       nbytes = 5;
     }
   else if (code >= 0x10000)
     {
       lead = 0xf0;
       nbytes = 4;
     }
   else if (code >= 0x800)
     {
       lead = 0xe0;
       nbytes = 3;
     }
   else if (code >= 0x80)
     {
       lead = 0xc0;
       nbytes = 2;
     }
   else
     {
       lead = 0;
       nbytes = 1;
     }

   if (outbuf)
     {
       int pos = nbytes - 1;

       /* Continuation bytes are filled from the end, six payload bits each. */
       while (pos > 0)
         {
           outbuf[pos] = (char)((code & 0x3f) | 0x80);
           code >>= 6;
           --pos;
         }
       /* Whatever bits remain go into the lead byte next to its marker. */
       outbuf[0] = code | lead;
     }

   return nbytes;
 }


 /**
  * lucene_utf8towc:
  * @pwc: receives the decoded character; may be NULL to only measure
  *       the sequence.
  * @p: pointer to the first byte of a UTF-8 encoded character.
  * @n: number of bytes that may be read starting at @p.
  *
  * Decodes one UTF-8 sequence (1-6 byte scheme).  At most @n bytes are
  * read.  Overlong encodings are not rejected.
  *
  * Return value: number of bytes consumed, or 0 when @n is 0, the lead
  * byte is invalid, the sequence needs more than @n bytes, or one of the
  * following bytes is not a continuation byte (which includes a NUL
  * cutting the sequence short).  On failure @pwc is left untouched.
  **/
 size_t lucene_utf8towc(wchar_t *pwc, const char *p, size_t n)
 {
   const unsigned char *bytes = (const unsigned char *) p;
   unsigned long result;
   unsigned char lead;
   int len;
   int i;

   if (n == 0)
     return 0;

   /* The lead byte fixes the sequence length and carries the top bits. */
   lead = bytes[0];
   if (lead < 0x80)
     {
       len = 1;
       result = lead;
     }
   else if ((lead & 0xe0) == 0xc0)
     {
       len = 2;
       result = lead & 0x1f;
     }
   else if ((lead & 0xf0) == 0xe0)
     {
       len = 3;
       result = lead & 0x0f;
     }
   else if ((lead & 0xf8) == 0xf0)
     {
       len = 4;
       result = lead & 0x07;
     }
   else if ((lead & 0xfc) == 0xf8)
     {
       len = 5;
       result = lead & 0x03;
     }
   else if ((lead & 0xfe) == 0xfc)
     {
       len = 6;
       result = lead & 0x01;
     }
   else
     return 0;	/* stray continuation byte or 0xfe/0xff */

   /* Never read past the bytes the caller said are available. */
   if ((size_t) len > n)
     return 0;

   for (i = 1; i < len; ++i)
     {
       /* Stops at the first bad byte, so a NUL terminator is never skipped. */
       if ((bytes[i] & 0xc0) != 0x80)
         return 0;
       result = (result << 6) | (unsigned long) (bytes[i] & 0x3f);
     }

   if (pwc)
     *pwc = (wchar_t) result;
   return (size_t) len;
 }


 //this function was not taken from gnome
 /**
  * lucene_wcstoutf8:
  * @result: output buffer of @result_length bytes.
  * @str: NUL-terminated wide string to convert.
  * @result_length: size of @result in bytes, including room for the
  *                 terminating NUL.
  *
  * Converts @str to UTF-8.  Conversion stops at the end of @str or
  * before the first character whose encoding would not fit, so a
  * multi-byte character is never written partially or past the buffer.
  * The output is always NUL-terminated unless @result_length is 0, in
  * which case nothing is written.
  *
  * Return value: number of bytes written, not counting the NUL
  **/
 size_t lucene_wcstoutf8(char * result, const wchar_t * str, size_t result_length){
   char *p = result;
   char *limit;
   size_t i = 0;

   if (result_length == 0)
     return 0;	/* no room even for the terminator */

   /* Last position a payload byte may not occupy: reserved for the NUL. */
   limit = result + result_length - 1;

   while (str[i] != 0){
     /* Ask for the encoded size first; one character can need up to 6 bytes. */
     size_t need = lucene_wctoutf8(NULL, str[i]);
     if (need > (size_t)(limit - p))
       break;
     p += lucene_wctoutf8(p, str[i]);
     ++i;
   }

   *p = '\0';

   return (size_t)(p - result);
 }
 //this function was not taken from gnome
 /**
  * lucene_utf8towcs:
  * @result: output buffer of @result_length wide characters.
  * @str: NUL-terminated UTF-8 string to convert.
  * @result_length: capacity of @result, counted in wchar_t elements.
  *
  * Converts @str to wide characters, one lucene_utf8towc() call per
  * character, stopping at the end of @str or when @result is full.
  * @result is NUL-terminated whenever there is an element left for the
  * terminator.
  *
  * Return value: number of bytes of @str consumed, or 0 if a character
  * could not be decoded (the output is then terminated at the point of
  * failure).
  **/
 size_t lucene_utf8towcs(wchar_t * result, const char * str, size_t result_length){
   const char *sp = str;
   wchar_t *rp = result;

   while (rp < result + result_length && *sp != 0){
     size_t avail = 1;
     size_t r;

     /* Count the bytes really present (max 6) so decoding cannot run past the NUL. */
     while (avail < 6 && sp[avail] != 0)
       ++avail;

     r = lucene_utf8towc(rp, sp, avail);
     /* 0 signals a decode failure; r > avail also catches (size_t)-1 and a
        sequence that claims more bytes than the string holds. */
     if (r == 0 || r > avail){
       *rp = '\0';
       return 0;
     }
     sp += r;
     rp++;
   }

   /* Terminate based on characters written, not on bytes consumed. */
   if (rp < result + result_length)
     *rp = '\0';

   return (size_t)(sp - str);
 }
 //get the number of bytes that make up the utf8 character.
 //this function was not taken from gnome
 //
 //Only the first byte at p is examined. When that byte cannot start a
 //sequence, UTF8_COMPUTE yields -1, which is returned converted to size_t.
 size_t lucene_utf8charlen(const char *p)
 {
   unsigned char lead = (unsigned char) p[0];
   int nbytes = 0;
   int payload_mask = 0;	/* required by the macro; its value is not used here */

   UTF8_COMPUTE (lead, payload_mask, nbytes);
   return nbytes;
 }
	/*
	* Copyright (C) 1999 Tom Tromey
	* Copyright (C) 2000 Red Hat, Inc.
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Lesser General Public
	* License as published by the Free Software Foundation; either
	* version 2 of the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Lesser General Public License for more details.
	*
	* You should have received a copy of the GNU Lesser General Public
	* License along with this library; if not, write to the
	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
	* Boston, MA 02111-1307, USA.
	*
	*
	************************************************
	* Also licensed with permission from Tom Tromey
	* and Owen Taylor under the Apache license.
	* Original location:
	* http://cvs.gnome.org/viewcvs/glib/glib/gutf8.c?rev=1.50&view=log
	************************************************
	*
	* Copyright 2003-2006 The Apache Software Foundation
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	#include "CLucene/StdHeader.h"

	/* GLib-style aliases retained from the gutf8.c code this file derives from. */
	typedef unsigned long gunichar; /* holds a character code while it is being encoded */
	typedef unsigned char guchar; /* small unsigned value; used below for a byte count */

	/*
	 * UTF8_COMPUTE: classify the lead byte of a UTF-8 sequence.
	 *
	 * Char - value of the first byte (pass it as an unsigned value);
	 *        evaluated several times, so it must have no side effects.
	 * Mask - lvalue that receives the bit mask selecting the payload bits
	 *        of the lead byte; left untouched when the byte is invalid.
	 * Len  - signed lvalue that receives the sequence length (1-6), or -1
	 *        when Char cannot start a sequence.
	 *
	 * Every argument is parenthesized and the body is wrapped in
	 * do/while (0) so the macro behaves as a single statement (safe in an
	 * unbraced if/else); invoke it with a trailing semicolon.
	 */
	#define UTF8_COMPUTE(Char, Mask, Len) \
	do \
	{ \
		if ((Char) < 128) \
		{ \
			(Len) = 1; \
			(Mask) = 0x7f; \
		} \
		else if (((Char) & 0xe0) == 0xc0) \
		{ \
			(Len) = 2; \
			(Mask) = 0x1f; \
		} \
		else if (((Char) & 0xf0) == 0xe0) \
		{ \
			(Len) = 3; \
			(Mask) = 0x0f; \
		} \
		else if (((Char) & 0xf8) == 0xf0) \
		{ \
			(Len) = 4; \
			(Mask) = 0x07; \
		} \
		else if (((Char) & 0xfc) == 0xf8) \
		{ \
			(Len) = 5; \
			(Mask) = 0x03; \
		} \
		else if (((Char) & 0xfe) == 0xfc) \
		{ \
			(Len) = 6; \
			(Mask) = 0x01; \
		} \
		else \
			(Len) = -1; \
	} \
	while (0)

	/*#define UTF8_LENGTH(Char) \
	((Char) < 0x80 ? 1 : \
	((Char) < 0x800 ? 2 : \
	((Char) < 0x10000 ? 3 : \
	((Char) < 0x200000 ? 4 : \
	((Char) < 0x4000000 ? 5 : 6)))))*/


	/*
	 * UTF8_GET: decode the UTF-8 sequence starting at Chars[0].
	 *
	 * Result - integer lvalue that receives the decoded value, or -1 as soon
	 *          as a byte that is not a continuation byte (10xxxxxx) is met.
	 * Chars  - indexable byte sequence; Chars[0] .. Chars[Len-1] may be read.
	 * Count  - integer lvalue used as the loop index; after a successful
	 *          decode it equals Len, after a failure it is the index of the
	 *          offending byte.
	 * Mask   - payload mask for the lead byte (see UTF8_COMPUTE).
	 * Len    - sequence length in bytes.
	 *
	 * Wrapped in do/while (0) so both the initial assignment and the loop
	 * are governed by an enclosing unbraced if/else; invoke it with a
	 * trailing semicolon.  The break only leaves the inner for loop.
	 */
	#define UTF8_GET(Result, Chars, Count, Mask, Len) \
	do \
	{ \
		(Result) = (Chars)[0] & (Mask); \
		for ((Count) = 1; (Count) < (Len); ++(Count)) \
		{ \
			if (((Chars)[(Count)] & 0xc0) != 0x80) \
			{ \
				(Result) = -1; \
				break; \
			} \
			(Result) <<= 6; \
			(Result) |= ((Chars)[(Count)] & 0x3f); \
		} \
	} \
	while (0)


	/**
	* lucene_wctoutf8:
	* @outbuf: destination buffer with room for at least 6 bytes, or NULL
	* to only compute the encoded length without writing anything.
	* @ch: the character code to encode.
	*
	* Encodes a single character as UTF-8 using the 1-6 byte scheme (codes
	* of 0x200000 and above produce 5 or 6 bytes). No terminating NUL is
	* written.
	*
	* Return value: number of bytes the encoding occupies (1 to 6)
	**/
	size_t lucene_wctoutf8(char * outbuf, const wchar_t ch)
	{
		gunichar c = ch;
		guchar len = 0;
		int first;
		int i;

		/* Pick the sequence length and the marker bits of the lead byte. */
		if (c < 0x80)
		{
			first = 0;
			len = 1;
		}
		else if (c < 0x800)
		{
			first = 0xc0;
			len = 2;
		}
		else if (c < 0x10000)
		{
			first = 0xe0;
			len = 3;
		}
		else if (c < 0x200000)
		{
			first = 0xf0;
			len = 4;
		}
		else if (c < 0x4000000)
		{
			first = 0xf8;
			len = 5;
		}
		else
		{
			first = 0xfc;
			len = 6;
		}

		if (outbuf)
		{
			/* Continuation bytes are filled from the end, six payload bits each. */
			for (i = len - 1; i > 0; --i)
			{
				outbuf[i] = (char)((c & 0x3f) | 0x80);
				c >>= 6;
			}
			/* Whatever bits remain go into the lead byte next to its marker. */
			outbuf[0] = (char)(c | first);
		}

		return len;
	}


	/**
	* lucene_utf8towc:
	* @pwc: receives the decoded character; may be NULL to only measure
	* the sequence.
	* @p: pointer to the first byte of a UTF-8 encoded character.
	* @n: number of bytes that may be read starting at @p.
	*
	* Decodes one UTF-8 sequence (1-6 byte scheme). At most @n bytes are
	* read. Overlong encodings are not rejected.
	*
	* Return value: number of bytes consumed, or 0 when @n is 0, the lead
	* byte is invalid, the sequence needs more than @n bytes, or one of the
	* following bytes is not a continuation byte (which includes a NUL
	* cutting the sequence short). On failure @pwc is left untouched.
	**/
	size_t lucene_utf8towc(wchar_t *pwc, const char *p, size_t n)
	{
		const unsigned char *bytes = (const unsigned char *) p;
		unsigned long result;
		unsigned char lead;
		int len;
		int i;

		if (n == 0)
			return 0;

		/* The lead byte fixes the sequence length and carries the top bits. */
		lead = bytes[0];
		if (lead < 0x80)
		{
			len = 1;
			result = lead;
		}
		else if ((lead & 0xe0) == 0xc0)
		{
			len = 2;
			result = lead & 0x1f;
		}
		else if ((lead & 0xf0) == 0xe0)
		{
			len = 3;
			result = lead & 0x0f;
		}
		else if ((lead & 0xf8) == 0xf0)
		{
			len = 4;
			result = lead & 0x07;
		}
		else if ((lead & 0xfc) == 0xf8)
		{
			len = 5;
			result = lead & 0x03;
		}
		else if ((lead & 0xfe) == 0xfc)
		{
			len = 6;
			result = lead & 0x01;
		}
		else
			return 0; /* stray continuation byte or 0xfe/0xff */

		/* Never read past the bytes the caller said are available. */
		if ((size_t) len > n)
			return 0;

		for (i = 1; i < len; ++i)
		{
			/* Stops at the first bad byte, so a NUL terminator is never skipped. */
			if ((bytes[i] & 0xc0) != 0x80)
				return 0;
			result = (result << 6) | (unsigned long) (bytes[i] & 0x3f);
		}

		if (pwc)
			*pwc = (wchar_t) result;
		return (size_t) len;
	}


	//this function was not taken from gnome
	/**
	* lucene_wcstoutf8:
	* @result: output buffer of @result_length bytes.
	* @str: NUL-terminated wide string to convert.
	* @result_length: size of @result in bytes, including room for the
	* terminating NUL.
	*
	* Converts @str to UTF-8. Conversion stops at the end of @str or
	* before the first character whose encoding would not fit, so a
	* multi-byte character is never written partially or past the buffer.
	* The output is always NUL-terminated unless @result_length is 0, in
	* which case nothing is written.
	*
	* Return value: number of bytes written, not counting the NUL
	**/
	size_t lucene_wcstoutf8(char * result, const wchar_t * str, size_t result_length){
		char *p = result;
		char *limit;
		size_t i = 0;

		if (result_length == 0)
			return 0; /* no room even for the terminator */

		/* Last position a payload byte may not occupy: reserved for the NUL. */
		limit = result + result_length - 1;

		while (str[i] != 0){
			/* Ask for the encoded size first; one character can need up to 6 bytes. */
			size_t need = lucene_wctoutf8(NULL, str[i]);
			if (need > (size_t)(limit - p))
				break;
			p += lucene_wctoutf8(p, str[i]);
			++i;
		}

		*p = '\0';

		return (size_t)(p - result);
	}
	//this function was not taken from gnome
	/**
	* lucene_utf8towcs:
	* @result: output buffer of @result_length wide characters.
	* @str: NUL-terminated UTF-8 string to convert.
	* @result_length: capacity of @result, counted in wchar_t elements.
	*
	* Converts @str to wide characters, one lucene_utf8towc() call per
	* character, stopping at the end of @str or when @result is full.
	* @result is NUL-terminated whenever there is an element left for the
	* terminator.
	*
	* Return value: number of bytes of @str consumed, or 0 if a character
	* could not be decoded (the output is then terminated at the point of
	* failure).
	**/
	size_t lucene_utf8towcs(wchar_t * result, const char * str, size_t result_length){
		const char *sp = str;
		wchar_t *rp = result;

		while (rp < result + result_length && *sp != 0){
			size_t avail = 1;
			size_t r;

			/* Count the bytes really present (max 6) so decoding cannot run past the NUL. */
			while (avail < 6 && sp[avail] != 0)
				++avail;

			r = lucene_utf8towc(rp, sp, avail);
			/* 0 signals a decode failure; r > avail also catches (size_t)-1 and a
			   sequence that claims more bytes than the string holds. */
			if (r == 0 || r > avail){
				*rp = '\0';
				return 0;
			}
			sp += r;
			rp++;
		}

		/* Terminate based on characters written, not on bytes consumed. */
		if (rp < result + result_length)
			*rp = '\0';

		return (size_t)(sp - str);
	}
	//get the number of bytes that make up the utf8 character.
	//this function was not taken from gnome
	//
	//Only the first byte at p is examined. When that byte cannot start a
	//sequence, UTF8_COMPUTE yields -1, which is returned converted to size_t.
	size_t lucene_utf8charlen(const char *p)
	{
		unsigned char lead = (unsigned char) p[0];
		int nbytes = 0;
		int payload_mask = 0; /* required by the macro; its value is not used here */

		UTF8_COMPUTE (lead, payload_mask, nbytes);
		return nbytes;
	}