blob: a43894a70c8f7df5544b279c5584e6562e7378aa [file] [log] [blame]
/*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*
*
************************************************
* Also licensed with permission from Tom Tromey
* and Owen Taylor under the Apache license.
* Original location:
* http://cvs.gnome.org/viewcvs/glib/glib/gutf8.c?rev=1.50&view=log
************************************************
*
* Copyright 2003-2006 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "CLucene/StdHeader.h"
/* Integer aliases used by the UTF-8 helpers in this file. */
typedef unsigned long gunichar; /* holds a character code while it is being encoded */
typedef unsigned char guchar; /* small unsigned integer; used below for a byte count */
/*
 * UTF8_COMPUTE: classify the lead byte Char of a UTF-8 sequence.
 *
 * Sets Len to the total length of the sequence in bytes (1-6) and Mask to
 * the bit mask that extracts the payload bits from the lead byte.  When
 * Char cannot start a sequence, Len is set to -1 and Mask is left untouched.
 *
 * Char is evaluated several times, so pass an expression without side
 * effects.  The body is wrapped in do/while(0) so that the macro behaves as
 * one statement, including inside an unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
    do { \
        if ((Char) < 128) { \
            (Len) = 1; \
            (Mask) = 0x7f; \
        } else if (((Char) & 0xe0) == 0xc0) { \
            (Len) = 2; \
            (Mask) = 0x1f; \
        } else if (((Char) & 0xf0) == 0xe0) { \
            (Len) = 3; \
            (Mask) = 0x0f; \
        } else if (((Char) & 0xf8) == 0xf0) { \
            (Len) = 4; \
            (Mask) = 0x07; \
        } else if (((Char) & 0xfc) == 0xf8) { \
            (Len) = 5; \
            (Mask) = 0x03; \
        } else if (((Char) & 0xfe) == 0xfc) { \
            (Len) = 6; \
            (Mask) = 0x01; \
        } else { \
            (Len) = -1; \
        } \
    } while (0)
/*#define UTF8_LENGTH(Char) \
((Char) < 0x80 ? 1 : \
((Char) < 0x800 ? 2 : \
((Char) < 0x10000 ? 3 : \
((Char) < 0x200000 ? 4 : \
((Char) < 0x4000000 ? 5 : 6)))))*/
/*
 * UTF8_GET: decode the Len-byte UTF-8 sequence starting at Chars into Result.
 *
 * Mask selects the payload bits of the lead byte; Count is a scratch loop
 * index supplied by the caller.  Each following byte must match the
 * continuation pattern 10xxxxxx; if one does not, Result is set to -1 and
 * decoding stops at that byte.
 *
 * The body is wrapped in do/while(0) so that the assignment and the loop
 * form one statement, including inside an unbraced if.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
    do { \
        (Result) = (Chars)[0] & (Mask); \
        for ((Count) = 1; (Count) < (Len); ++(Count)) { \
            if (((Chars)[(Count)] & 0xc0) != 0x80) { \
                (Result) = -1; \
                break; \
            } \
            (Result) <<= 6; \
            (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } while (0)
/**
 * lucene_wctoutf8:
 * @outbuf: destination for the encoded bytes; needs room for up to 6 bytes.
 *          May be a null pointer, in which case nothing is written and
 *          only the length is computed.
 * @ch: the character code to encode
 *
 * Encodes a single character as a UTF-8 sequence of 1 to 6 bytes.  No
 * terminating NUL is appended.
 *
 * Return value: the length of the encoding in bytes (written to @outbuf
 * when it is non-null)
 **/
size_t lucene_wctoutf8(char * outbuf, const wchar_t ch)
{
    gunichar code = ch;
    guchar nbytes;
    int lead;
    int pos;

    /* The magnitude of the code selects the length and lead-byte marker. */
    if (code >= 0x4000000) {
        lead = 0xfc;
        nbytes = 6;
    } else if (code >= 0x200000) {
        lead = 0xf8;
        nbytes = 5;
    } else if (code >= 0x10000) {
        lead = 0xf0;
        nbytes = 4;
    } else if (code >= 0x800) {
        lead = 0xe0;
        nbytes = 3;
    } else if (code >= 0x80) {
        lead = 0xc0;
        nbytes = 2;
    } else {
        lead = 0;
        nbytes = 1;
    }

    /* Length-only query. */
    if (!outbuf)
        return nbytes;

    /* Fill continuation bytes from the back, six payload bits at a time. */
    pos = nbytes - 1;
    while (pos > 0) {
        outbuf[pos] = (char)((code & 0x3f) | 0x80);
        code >>= 6;
        --pos;
    }

    /* What is left of the code goes into the lead byte with its marker. */
    outbuf[0] = (char)(code | lead);
    return nbytes;
}
/**
 * lucene_utf8towc:
 * @pwc: receives the decoded character
 * @p: pointer to the first byte of a UTF-8 encoded character
 * @n: accepted for interface compatibility but not consulted; the number
 *     of bytes read is determined by the lead byte at @p
 *
 * Decodes one UTF-8 sequence of 1 to 6 bytes.  Reading stops at the first
 * byte that is not a valid continuation byte, so a NUL-terminated string
 * is never read past its terminator.  Overlong forms are not rejected.
 *
 * Return value: the number of bytes consumed, or 0 if the lead byte cannot
 * start a sequence or a continuation byte is malformed.  @pwc is written
 * only on success.
 **/
size_t lucene_utf8towc(wchar_t *pwc, const char *p, size_t n)
{
    const unsigned char *bytes = (const unsigned char *) p;
    const unsigned char lead = bytes[0];
    unsigned long value;
    size_t len;
    size_t i;

    (void) n;

    /* The lead byte fixes both the sequence length and its payload bits. */
    if (lead < 0x80) {
        len = 1;
        value = lead;
    } else if ((lead & 0xe0) == 0xc0) {
        len = 2;
        value = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {
        len = 3;
        value = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {
        len = 4;
        value = lead & 0x07;
    } else if ((lead & 0xfc) == 0xf8) {
        len = 5;
        value = lead & 0x03;
    } else if ((lead & 0xfe) == 0xfc) {
        len = 6;
        value = lead & 0x01;
    } else {
        return 0;
    }

    for (i = 1; i < len; ++i) {
        /* Every trailing byte must look like 10xxxxxx. */
        if ((bytes[i] & 0xc0) != 0x80)
            return 0;
        value = (value << 6) | (unsigned long) (bytes[i] & 0x3f);
    }

    *pwc = (wchar_t) value;
    return len;
}
//this function was not taken from gnome
/**
 * lucene_wcstoutf8:
 * @result: destination buffer for the UTF-8 bytes
 * @str: zero-terminated wide-character string to convert
 * @result_length: capacity of @result in bytes, including the terminator
 *
 * Converts @str to UTF-8.  Conversion stops at the end of @str or before
 * the first character whose encoding would not fit in front of the
 * terminating NUL, so @result is never written past @result_length bytes.
 * If @result_length is 0 nothing is written at all.
 *
 * Return value: number of bytes stored in @result, not counting the
 * terminating NUL
 **/
size_t lucene_wcstoutf8(char * result, const wchar_t * str, size_t result_length){
    char *p = result;
    char *limit;
    size_t i = 0;

    /* Without room for the terminator there is nothing safe to write. */
    if (result_length == 0)
        return 0;

    /* The last byte of the buffer is reserved for the terminating NUL. */
    limit = result + (result_length - 1);

    while (str[i] != 0) {
        /* A null output buffer makes the encoder report the length only. */
        size_t need = lucene_wctoutf8(NULL, str[i]);
        if (need > (size_t)(limit - p))
            break;
        p += lucene_wctoutf8(p, str[i]);
        ++i;
    }

    *p = '\0';
    return (size_t)(p - result);
}
//this function was not taken from gnome
/**
 * lucene_utf8towcs:
 * @result: destination buffer for the decoded wide characters
 * @str: NUL-terminated UTF-8 string to convert
 * @result_length: capacity of @result in wide characters
 *
 * Decodes @str one character at a time with lucene_utf8towc() until the
 * end of @str is reached or @result is full.  A terminating zero is stored
 * whenever a free slot remains after the decoded characters; a completely
 * filled @result is left unterminated.
 *
 * Return value: number of bytes of @str that were consumed, or 0 if
 * lucene_utf8towc() reports an invalid sequence (0 is also returned for an
 * empty @str)
 **/
size_t lucene_utf8towcs(wchar_t * result, const char * str, size_t result_length){
    const char *sp = str;
    wchar_t *rp = result;
    wchar_t *rend = result + result_length;

    while (rp < rend && *sp != 0) {
        /* 6 is the longest sequence the decoder recognises. */
        size_t r = lucene_utf8towc(rp, sp, 6);
        /* The decoder signals failure with 0; stop instead of emitting
           an unwritten slot and looping on the same byte. */
        if (r == 0)
            return 0;
        sp += r;
        rp++;
    }

    /* Terminate based on free output slots, not on input bytes consumed. */
    if (rp < rend)
        *rp = '\0';

    return (size_t)(sp - str);
}
//this function was not taken from gnome
/**
 * lucene_utf8charlen:
 * @p: pointer to the first byte of a UTF-8 encoded character
 *
 * Reports how many bytes the character starting at @p occupies, judged
 * from its lead byte alone; the following bytes are not examined.
 *
 * Return value: the length in bytes (1 to 6).  For a byte that cannot
 * start a sequence the internal length is -1, which is returned converted
 * to size_t.
 **/
size_t lucene_utf8charlen(const char *p)
{
    const unsigned char lead = (unsigned char) *p;
    int payload_mask = 0; /* set by UTF8_COMPUTE, not needed here */
    int nbytes = 0;

    UTF8_COMPUTE (lead, payload_mask, nbytes);

    return nbytes;
}