// blob: ffe403cd8fc1179291d69d7261d19480f9ed6ae5 [file] [log] [blame]
//
// ========================================================================
// Copyright (c) 1995-2014 Mort Bay Consulting Pty. Ltd.
// ------------------------------------------------------------------------
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// and Apache License v2.0 which accompanies this distribution.
//
// The Eclipse Public License is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// The Apache License v2.0 is available at
// http://www.opensource.org/licenses/apache2.0.php
//
// You may elect to redistribute this code under either of these licenses.
// ========================================================================
//
package org.eclipse.jetty.util;
import java.io.IOException;
import org.eclipse.jetty.util.log.Log;
import org.eclipse.jetty.util.log.Logger;
/* ------------------------------------------------------------ */
/**
* Utf8 Appendable abstract base class
*
* This abstract class wraps a standard {@link java.lang.Appendable} and provides methods to append UTF-8 encoded bytes, that are converted into characters.
*
* This class is stateful and up to 4 calls to {@link #append(byte)} may be needed before a character is appended to the wrapped {@link java.lang.Appendable}.
*
* The UTF-8 decoding is done by this class and no additional buffers or Readers are used. The UTF-8 code was inspired by
* http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
*
* License information for Bjoern Hoehrmann's code:
*
* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
* FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
* IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**/
public abstract class Utf8Appendable
{
    protected static final Logger LOG = Log.getLogger(Utf8Appendable.class);

    /** Character appended in place of a malformed or truncated UTF-8 sequence. */
    public static final char REPLACEMENT = '\ufffd';

    /** Decoder state in which no multi-byte sequence is in progress. */
    private static final int UTF8_ACCEPT = 0;

    /** Transition result signalling that the byte is not valid in the current state. */
    private static final int UTF8_REJECT = 12;

    /** Destination that receives the decoded characters. */
    protected final Appendable _appendable;

    /**
     * Current decoder state: {@link #UTF8_ACCEPT}, or the row offset into
     * {@link #TRANS_TABLE} of the sequence currently being decoded.
     */
    protected int _state = UTF8_ACCEPT;

    private static final byte[] BYTE_TABLE =
    {
        // The first part of the decoder maps every byte value (0-255) to a
        // character class. The classes keep the transition table small and
        // also yield the bitmask (0xFF >> class) applied to a leading byte.
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
        8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
    };

    private static final byte[] TRANS_TABLE =
    {
        // The second part is a transition table, indexed by
        // (current state + character class), that yields the next state.
        // Each state occupies one row of 12 entries.
        0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
        12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
        12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
        12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
        12,36,12,12,12,12,12,12,12,12,12,12
    };

    /** Code point accumulated so far for the sequence being decoded. */
    private int _codep;

    /**
     * Creates a decoder that writes decoded characters to the given appendable.
     *
     * @param appendable the destination for decoded characters
     */
    public Utf8Appendable(Appendable appendable)
    {
        _appendable = appendable;
    }

    /**
     * Returns the current length as defined by the subclass. This value is
     * compared against {@code maxChars} in {@link #append(byte[], int, int, int)}.
     *
     * @return the current length
     */
    public abstract int length();

    /**
     * Returns the decoder to the accept state, discarding any partially
     * decoded sequence. The wrapped appendable is not touched.
     */
    protected void reset()
    {
        // Clear the accumulator too, as every other recovery path in this class does.
        _codep = 0;
        _state = UTF8_ACCEPT;
    }

    /**
     * Decodes a single UTF-8 byte.
     *
     * @param b the byte to decode
     * @throws NotUtf8Exception if the byte is not valid in the current decoder state
     * @throws RuntimeException wrapping any {@link IOException} raised by the wrapped appendable
     */
    public void append(byte b)
    {
        try
        {
            appendByte(b);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }

    /**
     * Decodes {@code length} bytes of {@code b}, starting at {@code offset}.
     *
     * @param b the array holding the UTF-8 bytes
     * @param offset index of the first byte to decode
     * @param length number of bytes to decode
     * @throws NotUtf8Exception if a byte is not valid in the current decoder state
     * @throws RuntimeException wrapping any {@link IOException} raised by the wrapped appendable
     */
    public void append(byte[] b, int offset, int length)
    {
        try
        {
            int end = offset + length;
            for (int i = offset; i < end; i++)
            {
                appendByte(b[i]);
            }
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }

    /**
     * Decodes up to {@code length} bytes of {@code b}, starting at {@code offset},
     * stopping early once {@link #length()} exceeds {@code maxChars}.
     * <p>
     * The limit is tested before each byte is decoded, so when this method
     * returns {@code false} the value of {@link #length()} is already greater
     * than {@code maxChars} and the remaining bytes have not been consumed.
     *
     * @param b the array holding the UTF-8 bytes
     * @param offset index of the first byte to decode
     * @param length number of bytes to decode
     * @param maxChars the limit compared against {@link #length()}
     * @return {@code true} if every byte was decoded, {@code false} if decoding stopped at the limit
     * @throws NotUtf8Exception if a byte is not valid in the current decoder state
     * @throws RuntimeException wrapping any {@link IOException} raised by the wrapped appendable
     */
    public boolean append(byte[] b, int offset, int length, int maxChars)
    {
        try
        {
            int end = offset + length;
            for (int i = offset; i < end; i++)
            {
                if (length() > maxChars)
                {
                    return false;
                }
                appendByte(b[i]);
            }
            return true;
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }

    /**
     * Feeds one byte to the decoder, appending a character (or a surrogate
     * pair) to the wrapped appendable whenever a sequence completes.
     *
     * @param b the byte to decode
     * @throws IOException if the wrapped appendable fails
     * @throws NotUtf8Exception if the byte is not valid in the current state; before the
     *         exception is thrown a {@link #REPLACEMENT} is appended and the decoder is
     *         returned to the accept state
     */
    protected void appendByte(byte b) throws IOException
    {
        // Fast path: a positive (7-bit) byte outside any multi-byte sequence is the character itself.
        if (b > 0 && _state == UTF8_ACCEPT)
        {
            _appendable.append((char)(b & 0xFF));
        }
        else
        {
            int i = b & 0xFF;
            int type = BYTE_TABLE[i];
            // A leading byte is masked by its class; any later byte contributes its low 6 bits.
            _codep = _state == UTF8_ACCEPT ? (0xFF >> type) & i : (i & 0x3F) | (_codep << 6);
            int next = TRANS_TABLE[_state + type];

            switch (next)
            {
                case UTF8_ACCEPT:
                    _state = next;
                    if (_codep < Character.MIN_HIGH_SURROGATE)
                    {
                        _appendable.append((char)_codep);
                    }
                    else
                    {
                        // May expand to a surrogate pair for supplementary code points.
                        for (char c : Character.toChars(_codep))
                        {
                            _appendable.append(c);
                        }
                    }
                    break;

                case UTF8_REJECT:
                    // Build the message before the state is cleared; _state / 12 is the state's row number.
                    String reason = "byte " + TypeUtil.toHexString(b) + " in state " + (_state / 12);
                    _codep = 0;
                    _state = UTF8_ACCEPT;
                    _appendable.append(REPLACEMENT);
                    throw new NotUtf8Exception(reason);

                default:
                    // Sequence still in progress: remember where we are.
                    _state = next;
            }
        }
    }

    /**
     * Tells whether the decoder is between sequences.
     *
     * @return {@code true} if no multi-byte sequence is currently in progress
     */
    public boolean isUtf8SequenceComplete()
    {
        return _state == UTF8_ACCEPT;
    }

    /**
     * Thrown when the bytes supplied to the decoder are not valid UTF-8.
     */
    public static class NotUtf8Exception extends IllegalArgumentException
    {
        /**
         * @param reason detail appended to the fixed message prefix
         */
        public NotUtf8Exception(String reason)
        {
            super("Not valid UTF8! " + reason);
        }
    }

    /**
     * Verifies that no multi-byte sequence is left unfinished.
     * <p>
     * If one is, it is abandoned: the decoder is returned to the accept state
     * and a {@link #REPLACEMENT} is appended before the exception is thrown.
     *
     * @throws NotUtf8Exception if a sequence was incomplete
     * @throws RuntimeException wrapping any {@link IOException} raised by the wrapped appendable
     */
    protected void checkState()
    {
        if (replaceIncompleteSequence())
        {
            throw new NotUtf8Exception("incomplete UTF8 sequence");
        }
    }

    /**
     * Returns the string form of the wrapped appendable, tolerating a truncated
     * trailing sequence.
     * <p>
     * If a sequence is incomplete it is abandoned, a {@link #REPLACEMENT} is
     * appended and the problem is logged rather than thrown.
     *
     * @return {@code toString()} of the wrapped appendable
     * @throws RuntimeException wrapping any {@link IOException} raised by the wrapped appendable
     */
    public String toReplacedString()
    {
        if (replaceIncompleteSequence())
        {
            Throwable th = new NotUtf8Exception("incomplete UTF8 sequence");
            LOG.warn(th.toString());
            LOG.debug(th);
        }
        return _appendable.toString();
    }

    /**
     * Abandons an unfinished sequence, if there is one, by returning the decoder
     * to the accept state and appending a {@link #REPLACEMENT}.
     *
     * @return {@code true} if a sequence was unfinished and has been replaced,
     *         {@code false} if there was nothing to do
     */
    private boolean replaceIncompleteSequence()
    {
        if (isUtf8SequenceComplete())
        {
            return false;
        }

        _codep = 0;
        _state = UTF8_ACCEPT;
        try
        {
            _appendable.append(REPLACEMENT);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
        return true;
    }
}