// blob: 118d77d51cae78aee0d088ab9f55ddb21793dd53 [file] [log] [blame]
// =================================================================================================
// ADOBE SYSTEMS INCORPORATED
// Copyright 2006 Adobe Systems Incorporated
// All Rights Reserved
//
// NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
// of the Adobe license agreement accompanying it.
// =================================================================================================
package com.adobe.xmp.impl;
import java.io.UnsupportedEncodingException;
/**
 * Utility that repairs byte buffers which are labelled as UTF-8 but contain
 * stray Latin-1 / Windows Cp1252 bytes. The class is not instantiable; use
 * {@link #convert(ByteBuffer)}.
 *
 * @since 12.10.2006
 */
public class Latin1Converter
{
    /** Parser state: currently outside of any multi-byte UTF-8 sequence. */
    private static final int STATE_START = 0;
    /** Parser state: a lead byte has been read, continuation bytes are expected. */
    private static final int STATE_UTF8CHAR = 11;
    /**
     * Size of the read-ahead buffer: one lead byte plus the at most seven
     * continuation bytes that {@link #countContinuationBytes(int)} can announce.
     */
    private static final int MAX_SEQUENCE_LENGTH = 8;
    /**
     * Unicode code points for the Cp1252 bytes 0x80..0x9F (index = byte - 0x80).
     * The five bytes that Windows 1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90,
     * 0x9D) are mapped to a space (0x20).
     */
    private static final int[] CP1252_HIGH =
    {
        0x20AC, 0x0020, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 0x80..0x87
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0020, 0x017D, 0x0020, // 0x88..0x8F
        0x0020, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 0x90..0x97
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0020, 0x017E, 0x0178  // 0x98..0x9F
    };


    /**
     * Private constructor; the class only offers static methods.
     */
    private Latin1Converter()
    {
        // EMPTY
    }


    /**
     * A converter that processes a byte buffer containing a mix of UTF-8 and
     * Latin-1/Cp1252 chars. The result is a buffer in which every byte that is
     * not part of a well-formed UTF-8 sequence has been re-encoded as UTF-8.
     * <p>
     * <em>Explanation of the processing:</em> The buffer's reported encoding is
     * queried first. Only buffers reported as "UTF-8" are processed; any other
     * encoding (e.g. the UTF-16/32 flavours) does not require further processing
     * and the buffer is returned unchanged.
     * <p>
     * If UTF-8 is reported, bytes that do not form a structurally complete
     * UTF-8 sequence are assumed to be Latin-1/Cp1252 encoded and are converted
     * to their corresponding UTF-8 byte sequence.
     * <p>
     * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code
     * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined
     * by Windows 1252. These are in XML's RestrictedChar set, so they are mapped
     * to a space.
     * <p>
     * The official Latin-1 characters in the range 0xA0..0xFF are converted into
     * the Unicode Latin Supplement range U+00A0 - U+00FF.
     * <p>
     * <em>Example:</em> If a Euro symbol (U+20AC) appears in the byte buffer as
     * (0xE2, 0x82, 0xAC), it is left as is. But if only the first two bytes
     * appear, followed by an ASCII char (0xE2 - 0x82 - 0x41), the result is
     * 0xC3, 0xA2 (U+00E2) - 0xE2, 0x80, 0x9A (U+201A) - 0x41.
     * <p>
     * NOTE: a sequence is accepted when a lead byte (&gt;= 0xC0) is followed by
     * the number of continuation bytes (0x80..0xBF) that its leading one-bits
     * announce. No further validation (overlong forms, lead bytes &gt;= 0xF8,
     * surrogates) is performed.
     *
     * @param buffer a byte buffer containing UTF-8 text that may be interspersed
     *        with Latin-1/Cp1252 bytes
     * @return Returns a new buffer with the repaired content if the encoding of
     *         <code>buffer</code> is UTF-8, otherwise <code>buffer</code> itself.
     */
    public static ByteBuffer convert(ByteBuffer buffer)
    {
        if (!"UTF-8".equals(buffer.getEncoding()))
        {
            // Latin-1 fixing applies only to UTF-8
            return buffer;
        }

        // holds the bytes of the UTF-8 sequence that is currently being read
        byte[] readAheadBuffer = new byte[MAX_SEQUENCE_LENGTH];
        // the number of bytes stored in readAheadBuffer
        int readAhead = 0;
        // continuation bytes still required to complete the current sequence
        int expectedBytes = 0;

        int length = buffer.length();
        // Estimated output size (input * 4/3), computed in long arithmetic so that
        // very large inputs cannot overflow into a negative capacity. The clamp
        // stays a few elements below Integer.MAX_VALUE -- presumably the capacity
        // ends up as an array size; TODO confirm against ByteBuffer.
        long estimate = length * 4L / 3;
        ByteBuffer out = new ByteBuffer((int) Math.min(estimate, Integer.MAX_VALUE - 8));

        int state = STATE_START;
        for (int i = 0; i < length; i++)
        {
            // assumes charAt() yields the unsigned byte value (0..255) -- TODO confirm
            int b = buffer.charAt(i);

            if (state == STATE_UTF8CHAR)
            {
                if (expectedBytes > 0 && (b & 0xC0) == 0x80)
                {
                    // valid continuation byte, add to readAheadBuffer
                    readAheadBuffer[readAhead++] = (byte) b;
                    expectedBytes--;
                    if (expectedBytes == 0)
                    {
                        // sequence complete: copy it through unchanged
                        out.append(readAheadBuffer, 0, readAhead);
                        readAhead = 0;
                        state = STATE_START;
                    }
                }
                else
                {
                    // broken sequence:
                    // 1. treat the lead byte as a Cp1252 char
                    out.append(convertToUTF8(readAheadBuffer[0]));
                    // 2. rewind so that the loop increment lands on the byte that
                    //    followed the lead byte and re-scan from there
                    i = i - readAhead;
                    readAhead = 0;
                    state = STATE_START;
                }
            }
            else // STATE_START
            {
                if (b < 0x80)
                {
                    // plain ASCII (including 0x7F) passes through
                    out.append((byte) b);
                }
                else if (b >= 0xC0)
                {
                    // start of UTF8 sequence
                    expectedBytes = countContinuationBytes(b);
                    readAheadBuffer[readAhead++] = (byte) b;
                    state = STATE_UTF8CHAR;
                }
                else // implicitly: b >= 0x80 && b < 0xC0
                {
                    // continuation byte without lead byte, assume Cp1252
                    out.append(convertToUTF8((byte) b));
                }
            }
        }

        // loop ended with a "half" UTF-8 char --> assume that the bytes are Latin-1
        if (state == STATE_UTF8CHAR)
        {
            for (int j = 0; j < readAhead; j++)
            {
                out.append(convertToUTF8(readAheadBuffer[j]));
            }
        }
        return out;
    }


    /**
     * Determines from the leading one-bits of a byte how many continuation
     * bytes are announced: the number of consecutive one-bits starting at bit 7,
     * minus one.
     *
     * @param leadByte a byte value in the range 0xC0..0xFF
     * @return Returns the number of continuation bytes (1..7 for the given range).
     */
    private static int countContinuationBytes(int leadByte)
    {
        int count = -1;
        for (int test = leadByte; count < 8 && (test & 0x80) == 0x80; test = test << 1)
        {
            count++;
        }
        return count;
    }


    /**
     * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a
     * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are
     * formally undefined by Windows 1252 and therefore replaced by a space
     * (0x20). Bytes below 0x80 are returned unchanged.
     * <p>
     * The mapping is done with a static table and manual UTF-8 encoding, so the
     * method does not depend on the charsets installed in the runtime and
     * cannot fail.
     *
     * @param ch
     *            a Cp1252 / Latin-1 byte
     * @return Returns a byte array containing a UTF-8 byte sequence.
     */
    private static byte[] convertToUTF8(byte ch)
    {
        int c = ch & 0xFF;
        if (c < 0x80)
        {
            return new byte[] { ch };
        }

        // 0x80..0x9F: Cp1252 specific chars; 0xA0..0xFF: code point equals byte value
        int codePoint = c < 0xA0 ? CP1252_HIGH[c - 0x80] : c;

        if (codePoint < 0x80)
        {
            // space substituted for an undefined Cp1252 byte
            return new byte[] { (byte) codePoint };
        }
        else if (codePoint < 0x800)
        {
            // two-byte form: 110xxxxx 10xxxxxx
            return new byte[]
            {
                (byte) (0xC0 | (codePoint >> 6)),
                (byte) (0x80 | (codePoint & 0x3F))
            };
        }
        else
        {
            // three-byte form: 1110xxxx 10xxxxxx 10xxxxxx (all table entries are < 0x10000)
            return new byte[]
            {
                (byte) (0xE0 | (codePoint >> 12)),
                (byte) (0x80 | ((codePoint >> 6) & 0x3F)),
                (byte) (0x80 | (codePoint & 0x3F))
            };
        }
    }
}