| // ================================================================================================= |
| // ADOBE SYSTEMS INCORPORATED |
| // Copyright 2006 Adobe Systems Incorporated |
| // All Rights Reserved |
| // |
| // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms |
| // of the Adobe license agreement accompanying it. |
| // ================================================================================================= |
| |
| |
| |
| package com.adobe.xmp.impl; |
| |
| import java.io.UnsupportedEncodingException; |
| |
| |
| /** |
| * @since 12.10.2006 |
| */ |
| public class Latin1Converter |
| { |
| /** */ |
| private static final int STATE_START = 0; |
| /** */ |
| private static final int STATE_UTF8CHAR = 11; |
| |
| |
| /** |
| * Private constructor |
| */ |
| private Latin1Converter() |
| { |
| // EMPTY |
| } |
| |
| |
| /** |
| * A converter that processes a byte buffer containing a mix of UTF8 and Latin-1/Cp1252 chars. |
| * The result is a buffer where those chars have been converted to UTF-8; |
| * that means it contains only valid UTF-8 chars. |
| * <p> |
| * <em>Explanation of the processing:</em> First the encoding of the buffer is detected looking |
| * at the first four bytes (that works only if the buffer starts with an ASCII-char, |
| * like xmls '<'). UTF-16/32 flavours do not require further proccessing. |
| * <p> |
| * In the case, UTF-8 is detected, it assumes wrong UTF8 chars to be a sequence of |
| * Latin-1/Cp1252 encoded bytes and converts the chars to their corresponding UTF-8 byte |
| * sequence. |
| * <p> |
| * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code |
| * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined |
| * by Windows 1252. These are in XML's RestrictedChar set, so we map them to a |
| * space. |
| * <p> |
| * The official Latin-1 characters in the range 0xA0..0xFF are converted into |
| * the Unicode Latin Supplement range U+00A0 - U+00FF. |
| * <p> |
| * <em>Example:</em> If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC), |
| * it will be left as is. But if only the first two bytes are appearing, |
| * followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to |
| * 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (a). |
| * |
| * @param buffer a byte buffer contain |
| * @return Returns a new buffer containing valid UTF-8 |
| */ |
| public static ByteBuffer convert(ByteBuffer buffer) |
| { |
| if ("UTF-8".equals(buffer.getEncoding())) |
| { |
| // the buffer containing one UTF-8 char (up to 8 bytes) |
| byte[] readAheadBuffer = new byte[8]; |
| // the number of bytes read ahead. |
| int readAhead = 0; |
| // expected UTF8 bytesto come |
| int expectedBytes = 0; |
| // output buffer with estimated length |
| ByteBuffer out = new ByteBuffer(buffer.length() * 4 / 3); |
| |
| int state = STATE_START; |
| for (int i = 0; i < buffer.length(); i++) |
| { |
| int b = buffer.charAt(i); |
| |
| switch (state) |
| { |
| default: |
| case STATE_START: |
| if (b < 0x7F) |
| { |
| out.append((byte) b); |
| } |
| else if (b >= 0xC0) |
| { |
| // start of UTF8 sequence |
| expectedBytes = -1; |
| int test = b; |
| for (; expectedBytes < 8 && (test & 0x80) == 0x80; test = test << 1) |
| { |
| expectedBytes++; |
| } |
| readAheadBuffer[readAhead++] = (byte) b; |
| state = STATE_UTF8CHAR; |
| } |
| else // implicitly: b >= 0x80 && b < 0xC0 |
| { |
| // invalid UTF8 start char, assume to be Latin-1 |
| byte[] utf8 = convertToUTF8((byte) b); |
| out.append(utf8); |
| } |
| break; |
| |
| case STATE_UTF8CHAR: |
| if (expectedBytes > 0 && (b & 0xC0) == 0x80) |
| { |
| // valid UTF8 char, add to readAheadBuffer |
| readAheadBuffer[readAhead++] = (byte) b; |
| expectedBytes--; |
| |
| if (expectedBytes == 0) |
| { |
| out.append(readAheadBuffer, 0, readAhead); |
| readAhead = 0; |
| |
| state = STATE_START; |
| } |
| } |
| else |
| { |
| // invalid UTF8 char: |
| // 1. convert first of seq to UTF8 |
| byte[] utf8 = convertToUTF8(readAheadBuffer[0]); |
| out.append(utf8); |
| |
| // 2. continue processing at second byte of sequence |
| i = i - readAhead; |
| readAhead = 0; |
| |
| state = STATE_START; |
| } |
| break; |
| } |
| } |
| |
| // loop ends with "half" Utf8 char --> assume that the bytes are Latin-1 |
| if (state == STATE_UTF8CHAR) |
| { |
| for (int j = 0; j < readAhead; j++) |
| { |
| byte b = readAheadBuffer[j]; |
| byte[] utf8 = convertToUTF8(b); |
| out.append(utf8); |
| } |
| } |
| |
| return out; |
| } |
| else |
| { |
| // Latin-1 fixing applies only to UTF-8 |
| return buffer; |
| } |
| } |
| |
| |
| /** |
| * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a |
| * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are |
| * formally undefined by Windows 1252 and therefore replaced by a space |
| * (0x20). |
| * |
| * @param ch |
| * an Cp1252 / Latin-1 byte |
| * @return Returns a byte array containing a UTF-8 byte sequence. |
| */ |
| private static byte[] convertToUTF8(byte ch) |
| { |
| int c = ch & 0xFF; |
| try |
| { |
| if (c >= 0x80) |
| { |
| if (c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D) |
| { |
| return new byte[] { 0x20 }; // space for undefined |
| } |
| |
| // interpret byte as Windows Cp1252 char |
| return new String(new byte[] { ch }, "cp1252").getBytes("UTF-8"); |
| } |
| } |
| catch (UnsupportedEncodingException e) |
| { |
| // EMPTY |
| } |
| return new byte[] { ch }; |
| } |
| } |