blob: edac49745be61394b5fd1d0efaa7238ff1082f95 [file] [log] [blame]
/*
* Copyright 2021 ZXing authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.zxing.datamatrix.encoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import com.google.zxing.common.MinimalECIInput;
/**
* Encoder that encodes minimally
*
* Algorithm:
*
* Uses Dijkstra to produce mathematically minimal encodings that are in some cases smaller than the results produced
* by the algorithm described in annex S in the specification ISO/IEC 16022:200(E). The biggest improvment of this
* algorithm over that one is the case when the algorithm enters the most inefficient mode, the B256 mode. The
* algorithm from the specification algorithm will exit this mode only if it encounters digits so that arbitrarily
* inefficient results can be produced if the postfix contains no digits.
*
* Multi ECI support and ECI switching:
*
* For multi language content the algorithm selects the most compact representation using ECI modes. Note that unlike
* the compaction algorithm used for QR-Codes, this implementation operates in two stages and therfore is not
* mathematically optimal. In the first stage, the input string is encoded minimally as a stream of ECI character set
* selectors and bytes encoded in the selected encoding. In this stage the algorithm might for example decide to
* encode ocurrences of the characters "\u0150\u015C" (O-double-acute, S-circumflex) in UTF-8 by a single ECI or
* alternatively by multiple ECIs that switch between IS0-8859-2 and ISO-8859-3 (e.g. in the case that the input
* contains many * characters from ISO-8859-2 (Latin 2) and few from ISO-8859-3 (Latin 3)).
* In a second stage this stream of ECIs and bytes is minimally encoded using the various Data Matrix encoding modes.
* While both stages encode mathematically minimally it is not ensured that the result is mathematically minimal since
* the size growth for inserting an ECI in the first stage can only be approximated as the first stage does not know
* in which mode the ECI will occur in the second stage (may, or may not require an extra latch to ASCII depending on
* the current mode). The reason for this shortcoming are difficulties in implementing it in a straightforward and
* readable manner.
*
* GS1 support
*
* FNC1 delimiters can be encoded in the input string by using the FNC1 character specified in the encoding function.
* When a FNC1 character is specified then a leading FNC1 will be encoded and all ocurrences of delimiter characters
* while result in FNC1 codewords in the symbol.
*
* @author Alex Geller
*/
public final class MinimalEncoder {
enum Mode {
ASCII,
C40,
TEXT,
X12,
EDF,
B256
}
static final char[] C40_SHIFT2_CHARS = {'!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_' };
private MinimalEncoder() {
}
static boolean isExtendedASCII(char ch, int fnc1) {
return ch != fnc1 && ch >= 128 && ch <= 255;
}
private static boolean isInC40Shift1Set(char ch) {
return ch <= 31;
}
private static boolean isInC40Shift2Set(char ch, int fnc1) {
for (char c40Shift2Char : C40_SHIFT2_CHARS) {
if (c40Shift2Char == ch) {
return true;
}
}
return ch == fnc1;
}
private static boolean isInTextShift1Set(char ch) {
return isInC40Shift1Set(ch);
}
private static boolean isInTextShift2Set(char ch, int fnc1) {
return isInC40Shift2Set(ch, fnc1);
}
/**
* Performs message encoding of a DataMatrix message
*
* @param msg the message
* @return the encoded message (the char values range from 0 to 255)
*/
public static String encodeHighLevel(String msg) {
return encodeHighLevel(msg, null, -1, SymbolShapeHint.FORCE_NONE);
}
/**
* Performs message encoding of a DataMatrix message
*
* @param msg the message
* @param priorityCharset The preferred {@link Charset}. When the value of the argument is null, the algorithm
* chooses charsets that leads to a minimal representation. Otherwise the algorithm will use the priority
* charset to encode any character in the input that can be encoded by it if the charset is among the
* supported charsets.
* @param fnc1 denotes the character in the input that represents the FNC1 character or -1 if this is not a GS1
* bar code. If the value is not -1 then a FNC1 is also prepended.
* @param shape requested shape.
* @return the encoded message (the char values range from 0 to 255)
*/
public static String encodeHighLevel(String msg, Charset priorityCharset, int fnc1, SymbolShapeHint shape) {
int macroId = 0;
if (msg.startsWith(HighLevelEncoder.MACRO_05_HEADER) && msg.endsWith(HighLevelEncoder.MACRO_TRAILER)) {
macroId = 5;
msg = msg.substring(HighLevelEncoder.MACRO_05_HEADER.length(), msg.length() - 2);
} else if (msg.startsWith(HighLevelEncoder.MACRO_06_HEADER) && msg.endsWith(HighLevelEncoder.MACRO_TRAILER)) {
macroId = 6;
msg = msg.substring(HighLevelEncoder.MACRO_06_HEADER.length(), msg.length() - 2);
}
return new String(encode(msg, priorityCharset, fnc1, shape, macroId), StandardCharsets.ISO_8859_1);
}
/**
* Encodes input minimally and returns an array of the codewords
*
* @param input The string to encode
* @param priorityCharset The preferred {@link Charset}. When the value of the argument is null, the algorithm
* chooses charsets that leads to a minimal representation. Otherwise the algorithm will use the priority
* charset to encode any character in the input that can be encoded by it if the charset is among the
* supported charsets.
* @param fnc1 denotes the character in the input that represents the FNC1 character or -1 if this is not a GS1
* bar code. If the value is not -1 then a FNC1 is also prepended.
* @param shape requested shape.
* @param macroId Prepends the specified macro function in case that a value of 5 or 6 is specified.
* @return An array of bytes representing the codewords of a minimal encoding.
*/
static byte[] encode(String input, Charset priorityCharset, int fnc1, SymbolShapeHint shape, int macroId) {
return encodeMinimally(new Input(input, priorityCharset, fnc1, shape, macroId)).getBytes();
}
static void addEdge(Edge[][] edges, Edge edge) {
int vertexIndex = edge.fromPosition + edge.characterLength;
if (edges[vertexIndex][edge.getEndMode().ordinal()] == null ||
edges[vertexIndex][edge.getEndMode().ordinal()].cachedTotalSize > edge.cachedTotalSize) {
edges[vertexIndex][edge.getEndMode().ordinal()] = edge;
}
}
/** @return the number of words in which the string starting at from can be encoded in c40 or text mode.
* The number of characters encoded is returned in characterLength.
* The number of characters encoded is also minimal in the sense that the algorithm stops as soon
* as a character encoding fills a C40 word competely (three C40 values). An exception is at the
* end of the string where two C40 values are allowed (according to the spec the third c40 value
* is filled with 0 (Shift 1) in this case).
*/
static int getNumberOfC40Words(Input input, int from, boolean c40,int[] characterLength) {
int thirdsCount = 0;
for (int i = from; i < input.length(); i++) {
if (input.isECI(i)) {
characterLength[0] = 0;
return 0;
}
char ci = input.charAt(i);
if (c40 && HighLevelEncoder.isNativeC40(ci) || !c40 && HighLevelEncoder.isNativeText(ci)) {
thirdsCount++; //native
} else if (!isExtendedASCII(ci, input.getFNC1Character())) {
thirdsCount += 2; //shift
} else {
int asciiValue = ci & 0xff;
if (asciiValue >= 128 && (c40 && HighLevelEncoder.isNativeC40((char) (asciiValue - 128)) ||
!c40 && HighLevelEncoder.isNativeText((char) (asciiValue - 128)))) {
thirdsCount += 3; // shift, Upper shift
} else {
thirdsCount += 4; // shift, Upper shift, shift
}
}
if (thirdsCount % 3 == 0 || ((thirdsCount - 2) % 3 == 0 && i + 1 == input.length())) {
characterLength[0] = i - from + 1;
return (int) Math.ceil(((double) thirdsCount) / 3.0);
}
}
characterLength[0] = 0;
return 0;
}
static void addEdges(Input input, Edge[][] edges, int from, Edge previous) {
if (input.isECI(from)) {
addEdge(edges, new Edge(input, Mode.ASCII, from, 1, previous));
return;
}
char ch = input.charAt(from);
if (previous == null || previous.getEndMode() != Mode.EDF) { //not possible to unlatch a full EDF edge to something
//else
if (HighLevelEncoder.isDigit(ch) && input.haveNCharacters(from, 2) &&
HighLevelEncoder.isDigit(input.charAt(from + 1))) {
// two digits ASCII encoded
addEdge(edges, new Edge(input, Mode.ASCII, from, 2, previous));
} else {
// one ASCII encoded character or an extended character via Upper Shift
addEdge(edges, new Edge(input, Mode.ASCII, from, 1, previous));
}
Mode[] modes = {Mode.C40, Mode.TEXT};
for (Mode mode : modes) {
int[] characterLength = new int[1];
if (getNumberOfC40Words(input, from, mode == Mode.C40, characterLength) > 0) {
addEdge(edges, new Edge(input, mode, from, characterLength[0], previous));
}
}
if (input.haveNCharacters(from,3) &&
HighLevelEncoder.isNativeX12(input.charAt(from)) &&
HighLevelEncoder.isNativeX12(input.charAt(from + 1)) &&
HighLevelEncoder.isNativeX12(input.charAt(from + 2))) {
addEdge(edges, new Edge(input, Mode.X12, from, 3, previous));
}
addEdge(edges, new Edge(input, Mode.B256, from, 1, previous));
}
//We create 4 EDF edges, with 1, 2 3 or 4 characters length. The fourth normally doesn't have a latch to ASCII
//unless it is 2 characters away from the end of the input.
int i;
for (i = 0; i < 3; i++) {
int pos = from + i;
if (input.haveNCharacters(pos,1) && HighLevelEncoder.isNativeEDIFACT(input.charAt(pos))) {
addEdge(edges, new Edge(input, Mode.EDF, from, i + 1, previous));
} else {
break;
}
}
if (i == 3 && input.haveNCharacters(from, 4) && HighLevelEncoder.isNativeEDIFACT(input.charAt(from + 3))) {
addEdge(edges, new Edge(input, Mode.EDF, from, 4, previous));
}
}
static Result encodeMinimally(Input input) {
@SuppressWarnings("checkstyle:lineLength")
/* The minimal encoding is computed by Dijkstra. The acyclic graph is modeled as follows:
* A vertex represents a combination of a position in the input and an encoding mode where position 0
* denotes the position left of the first character, 1 the position left of the second character and so on.
* Likewise the end vertices are located after the last character at position input.length().
* For any position there might be up to six vertices, one for each of the encoding types ASCII, C40, TEXT, X12,
* EDF and B256.
*
* As an example consider the input string "ABC123" then at position 0 there is only one vertex with the default
* ASCII encodation. At position 3 there might be vertices for the types ASCII, C40, X12, EDF and B256.
*
* An edge leading to such a vertex encodes one or more of the characters left of the position that the vertex
* represents. It encodes the characters in the encoding mode of the vertex that it ends on. In other words,
* all edges leading to a particular vertex encode the same characters (the length of the suffix can vary) using the same
* encoding mode.
* As an example consider the input string "ABC123" and the vertex (4,EDF). Possible edges leading to this vertex
* are:
* (0,ASCII) --EDF(ABC1)--> (4,EDF)
* (1,ASCII) --EDF(BC1)--> (4,EDF)
* (1,B256) --EDF(BC1)--> (4,EDF)
* (1,EDF) --EDF(BC1)--> (4,EDF)
* (2,ASCII) --EDF(C1)--> (4,EDF)
* (2,B256) --EDF(C1)--> (4,EDF)
* (2,EDF) --EDF(C1)--> (4,EDF)
* (3,ASCII) --EDF(1)--> (4,EDF)
* (3,B256) --EDF(1)--> (4,EDF)
* (3,EDF) --EDF(1)--> (4,EDF)
* (3,C40) --EDF(1)--> (4,EDF)
* (3,X12) --EDF(1)--> (4,EDF)
*
* The edges leading to a vertex are stored in such a way that there is a fast way to enumerate the edges ending
* on a particular vertex.
*
* The algorithm processes the vertices in order of their position thereby performing the following:
*
* For every vertex at position i the algorithm enumerates the edges ending on the vertex and removes all but the
* shortest from that list.
* Then it processes the vertices for the position i+1. If i+1 == input.length() then the algorithm ends
* and chooses the the edge with the smallest size from any of the edges leading to vertices at this position.
* Otherwise the algorithm computes all possible outgoing edges for the vertices at the position i+1
*
* Examples:
* The process is illustrated by showing the graph (edges) after each iteration from left to right over the input:
* An edge is drawn as follows "(" + fromVertex + ") -- " + encodingMode + "(" + encodedInput + ") (" +
* accumulatedSize + ") --> (" + toVertex + ")"
*
* Example 1 encoding the string "ABCDEFG":
*
*
* Situation after adding edges to the start vertex (0,ASCII)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII)
* (0,ASCII) B256(A) (3) --> (1,B256)
* (0,ASCII) EDF(AB) (4) --> (2,EDF)
* (0,ASCII) C40(ABC) (3) --> (3,C40)
* (0,ASCII) TEXT(ABC) (5) --> (3,TEXT)
* (0,ASCII) X12(ABC) (3) --> (3,X12)
* (0,ASCII) EDF(ABC) (4) --> (3,EDF)
* (0,ASCII) EDF(ABCD) (4) --> (4,EDF)
*
* Situation after adding edges to vertices at position 1
* (0,ASCII) ASCII(A) (1) --> (1,ASCII)
* (0,ASCII) B256(A) (3) --> (1,B256)
* (0,ASCII) EDF(AB) (4) --> (2,EDF)
* (0,ASCII) C40(ABC) (3) --> (3,C40)
* (0,ASCII) TEXT(ABC) (5) --> (3,TEXT)
* (0,ASCII) X12(ABC) (3) --> (3,X12)
* (0,ASCII) EDF(ABC) (4) --> (3,EDF)
* (0,ASCII) EDF(ABCD) (4) --> (4,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) B256(B) (4) --> (2,B256)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) EDF(BC) (5) --> (3,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) C40(BCD) (4) --> (4,C40)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) TEXT(BCD) (6) --> (4,TEXT)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) X12(BCD) (4) --> (4,X12)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) EDF(BCD) (5) --> (4,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) EDF(BCDE) (5) --> (5,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) ASCII(B) (4) --> (2,ASCII)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256)
* (0,ASCII) B256(A) (3) --> (1,B256) EDF(BC) (6) --> (3,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) C40(BCD) (5) --> (4,C40)
* (0,ASCII) B256(A) (3) --> (1,B256) TEXT(BCD) (7) --> (4,TEXT)
* (0,ASCII) B256(A) (3) --> (1,B256) X12(BCD) (5) --> (4,X12)
* (0,ASCII) B256(A) (3) --> (1,B256) EDF(BCD) (6) --> (4,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) EDF(BCDE) (6) --> (5,EDF)
*
* Edge "(1,ASCII) ASCII(B) (2) --> (2,ASCII)" is minimal for the vertex (2,ASCII) so that edge "(1,B256) ASCII(B) (4) --> (2,ASCII)" is removed.
* Edge "(1,B256) B256(B) (3) --> (2,B256)" is minimal for the vertext (2,B256) so that the edge "(1,ASCII) B256(B) (4) --> (2,B256)" is removed.
*
* Situation after adding edges to vertices at position 2
* (0,ASCII) ASCII(A) (1) --> (1,ASCII)
* (0,ASCII) B256(A) (3) --> (1,B256)
* (0,ASCII) EDF(AB) (4) --> (2,EDF)
* (0,ASCII) C40(ABC) (3) --> (3,C40)
* (0,ASCII) TEXT(ABC) (5) --> (3,TEXT)
* (0,ASCII) X12(ABC) (3) --> (3,X12)
* (0,ASCII) EDF(ABC) (4) --> (3,EDF)
* (0,ASCII) EDF(ABCD) (4) --> (4,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) EDF(BC) (5) --> (3,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) C40(BCD) (4) --> (4,C40)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) TEXT(BCD) (6) --> (4,TEXT)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) X12(BCD) (4) --> (4,X12)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) EDF(BCD) (5) --> (4,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) EDF(BCDE) (5) --> (5,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256)
* (0,ASCII) B256(A) (3) --> (1,B256) EDF(BC) (6) --> (3,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) C40(BCD) (5) --> (4,C40)
* (0,ASCII) B256(A) (3) --> (1,B256) TEXT(BCD) (7) --> (4,TEXT)
* (0,ASCII) B256(A) (3) --> (1,B256) X12(BCD) (5) --> (4,X12)
* (0,ASCII) B256(A) (3) --> (1,B256) EDF(BCD) (6) --> (4,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) EDF(BCDE) (6) --> (5,EDF)
* (0,ASCII) EDF(AB) (4) --> (2,EDF) ASCII(C) (5) --> (3,ASCII)
* (0,ASCII) EDF(AB) (4) --> (2,EDF) B256(C) (6) --> (3,B256)
* (0,ASCII) EDF(AB) (4) --> (2,EDF) EDF(CD) (7) --> (4,EDF)
* (0,ASCII) EDF(AB) (4) --> (2,EDF) C40(CDE) (6) --> (5,C40)
* (0,ASCII) EDF(AB) (4) --> (2,EDF) TEXT(CDE) (8) --> (5,TEXT)
* (0,ASCII) EDF(AB) (4) --> (2,EDF) X12(CDE) (6) --> (5,X12)
* (0,ASCII) EDF(AB) (4) --> (2,EDF) EDF(CDE) (7) --> (5,EDF)
* (0,ASCII) EDF(AB) (4) --> (2,EDF) EDF(CDEF) (7) --> (6,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) ASCII(C) (3) --> (3,ASCII)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) B256(C) (5) --> (3,B256)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) EDF(CD) (6) --> (4,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) C40(CDE) (5) --> (5,C40)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) TEXT(CDE) (7) --> (5,TEXT)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) X12(CDE) (5) --> (5,X12)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) EDF(CDE) (6) --> (5,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) EDF(CDEF) (6) --> (6,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) ASCII(C) (4) --> (3,ASCII)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) B256(C) (4) --> (3,B256)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) EDF(CD) (6) --> (4,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) C40(CDE) (5) --> (5,C40)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) TEXT(CDE) (7) --> (5,TEXT)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) X12(CDE) (5) --> (5,X12)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) EDF(CDE) (6) --> (5,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) EDF(CDEF) (6) --> (6,EDF)
*
* Edge "(2,ASCII) ASCII(C) (3) --> (3,ASCII)" is minimal for the vertex (3,ASCII) so that edges "(2,EDF) ASCII(C) (5) --> (3,ASCII)"
* and "(2,B256) ASCII(C) (4) --> (3,ASCII)" can be removed.
* Edge "(0,ASCII) EDF(ABC) (4) --> (3,EDF)" is minimal for the vertex (3,EDF) so that edges "(1,ASCII) EDF(BC) (5) --> (3,EDF)"
* and "(1,B256) EDF(BC) (6) --> (3,EDF)" can be removed.
* Edge "(2,B256) B256(C) (4) --> (3,B256)" is minimal for the vertex (3,B256) so that edges "(2,ASCII) B256(C) (5) --> (3,B256)"
* and "(2,EDF) B256(C) (6) --> (3,B256)" can be removed.
*
* This continues for vertices 3 thru 7
*
* Situation after adding edges to vertices at position 7
* (0,ASCII) ASCII(A) (1) --> (1,ASCII)
* (0,ASCII) B256(A) (3) --> (1,B256)
* (0,ASCII) EDF(AB) (4) --> (2,EDF)
* (0,ASCII) C40(ABC) (3) --> (3,C40)
* (0,ASCII) TEXT(ABC) (5) --> (3,TEXT)
* (0,ASCII) X12(ABC) (3) --> (3,X12)
* (0,ASCII) EDF(ABC) (4) --> (3,EDF)
* (0,ASCII) EDF(ABCD) (4) --> (4,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) C40(BCD) (4) --> (4,C40)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) TEXT(BCD) (6) --> (4,TEXT)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) X12(BCD) (4) --> (4,X12)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) EDF(BCDE) (5) --> (5,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256)
* (0,ASCII) C40(ABC) (3) --> (3,C40) C40(DEF) (5) --> (6,C40)
* (0,ASCII) X12(ABC) (3) --> (3,X12) X12(DEF) (5) --> (6,X12)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) ASCII(C) (3) --> (3,ASCII)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) C40(CDE) (5) --> (5,C40)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) TEXT(CDE) (7) --> (5,TEXT)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) X12(CDE) (5) --> (5,X12)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) EDF(CDEF) (6) --> (6,EDF)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) C40(BCD) (4) --> (4,C40) C40(EFG) (6) --> (7,C40) //Solution 1
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) X12(BCD) (4) --> (4,X12) X12(EFG) (6) --> (7,X12) //Solution 2
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) B256(C) (4) --> (3,B256)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) ASCII(C) (3) --> (3,ASCII) ASCII(D) (4) --> (4,ASCII)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) ASCII(C) (3) --> (3,ASCII) TEXT(DEF) (8) --> (6,TEXT)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) ASCII(C) (3) --> (3,ASCII) EDF(DEFG) (7) --> (7,EDF)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) B256(C) (4) --> (3,B256) B256(D) (5) --> (4,B256)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) ASCII(C) (3) --> (3,ASCII) ASCII(D) (4) --> (4,ASCII) ASCII(E) (5) --> (5,ASCII)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) ASCII(C) (3) --> (3,ASCII) ASCII(D) (4) --> (4,ASCII) TEXT(EFG) (9) --> (7,TEXT)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) B256(C) (4) --> (3,B256) B256(D) (5) --> (4,B256) B256(E) (6) --> (5,B256)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) ASCII(C) (3) --> (3,ASCII) ASCII(D) (4) --> (4,ASCII) ASCII(E) (5) --> (5,ASCII) ASCII(F) (6) --> (6,ASCII)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) B256(C) (4) --> (3,B256) B256(D) (5) --> (4,B256) B256(E) (6) --> (5,B256) B256(F) (7) --> (6,B256)
* (0,ASCII) ASCII(A) (1) --> (1,ASCII) ASCII(B) (2) --> (2,ASCII) ASCII(C) (3) --> (3,ASCII) ASCII(D) (4) --> (4,ASCII) ASCII(E) (5) --> (5,ASCII) ASCII(F) (6) --> (6,ASCII) ASCII(G) (7) --> (7,ASCII)
* (0,ASCII) B256(A) (3) --> (1,B256) B256(B) (3) --> (2,B256) B256(C) (4) --> (3,B256) B256(D) (5) --> (4,B256) B256(E) (6) --> (5,B256) B256(F) (7) --> (6,B256) B256(G) (8) --> (7,B256)
*
* Hence a minimal encoding of "ABCDEFG" is either ASCII(A),C40(BCDEFG) or ASCII(A), X12(BCDEFG) with a size of 5 bytes.
*/
int inputLength = input.length();
// Array that represents vertices. There is a vertex for every character and mode.
// The last dimension in the array below encodes the 6 modes ASCII, C40, TEXT, X12, EDF and B256
Edge[][] edges = new Edge[inputLength + 1][6];
addEdges(input, edges, 0, null);
for (int i = 1; i <= inputLength; i++) {
for (int j = 0; j < 6; j++) {
if (edges[i][j] != null && i < inputLength) {
addEdges(input, edges, i, edges[i][j]);
}
}
//optimize memory by removing edges that have been passed.
for (int j = 0; j < 6; j++) {
edges[i - 1][j] = null;
}
}
int minimalJ = -1;
int minimalSize = Integer.MAX_VALUE;
for (int j = 0; j < 6; j++) {
if (edges[inputLength][j] != null) {
Edge edge = edges[inputLength][j];
int size = j >= 1 && j <= 3 ? edge.cachedTotalSize + 1 : edge.cachedTotalSize; //C40, TEXT and X12 need an
// extra unlatch at the end
if (size < minimalSize) {
minimalSize = size;
minimalJ = j;
}
}
}
if (minimalJ < 0) {
throw new IllegalStateException("Failed to encode \"" + input + "\"");
}
return new Result(edges[inputLength][minimalJ]);
}
private static final class Edge {
private static final int[] allCodewordCapacities = {3, 5, 8, 10, 12, 16, 18, 22, 30, 32, 36, 44, 49, 62, 86, 114,
144, 174, 204, 280, 368, 456, 576, 696, 816, 1050, 1304, 1558};
private static final int[] squareCodewordCapacities = {3, 5, 8, 12, 18, 22, 30, 36, 44, 62, 86, 114, 144, 174, 204,
280, 368, 456, 576, 696, 816, 1050, 1304, 1558};
private static final int[] rectangularCodewordCapacities = {5, 10, 16, 33, 32, 49};
private final Input input;
private final Mode mode; //the mode at the start of this edge.
private final int fromPosition;
private final int characterLength;
private final Edge previous;
private final int cachedTotalSize;
private Edge(Input input, Mode mode, int fromPosition, int characterLength, Edge previous) {
this.input = input;
this.mode = mode;
this.fromPosition = fromPosition;
this.characterLength = characterLength;
this.previous = previous;
assert fromPosition + characterLength <= input.length();
int size = previous != null ? previous.cachedTotalSize : 0;
Mode previousMode = getPreviousMode();
/*
* Switching modes
* ASCII -> C40: latch 230
* ASCII -> TEXT: latch 239
* ASCII -> X12: latch 238
* ASCII -> EDF: latch 240
* ASCII -> B256: latch 231
* C40 -> ASCII: word(c1,c2,c3), 254
* TEXT -> ASCII: word(c1,c2,c3), 254
* X12 -> ASCII: word(c1,c2,c3), 254
* EDIFACT -> ASCII: Unlatch character,0,0,0 or c1,Unlatch character,0,0 or c1,c2,Unlatch character,0 or
* c1,c2,c3,Unlatch character
* B256 -> ASCII: without latch after n bytes
*/
switch (mode) {
case ASCII:
size++;
if (input.isECI(fromPosition) || isExtendedASCII(input.charAt(fromPosition), input.getFNC1Character())) {
size++;
}
if (previousMode == Mode.C40 ||
previousMode == Mode.TEXT ||
previousMode == Mode.X12) {
size++; // unlatch 254 to ASCII
}
break;
case B256:
size++;
if (previousMode != Mode.B256) {
size++; //byte count
} else if (getB256Size() == 250) {
size++; //extra byte count
}
if (previousMode == Mode.ASCII) {
size++; //latch to B256
} else if (previousMode == Mode.C40 ||
previousMode == Mode.TEXT ||
previousMode == Mode.X12) {
size += 2; //unlatch to ASCII, latch to B256
}
break;
case C40:
case TEXT:
case X12:
if (mode == Mode.X12) {
size += 2;
} else {
int[] charLen = new int[1];
size += getNumberOfC40Words(input, fromPosition, mode == Mode.C40, charLen) * 2;
}
if (previousMode == Mode.ASCII || previousMode == Mode.B256) {
size++; //additional byte for latch from ASCII to this mode
} else if (previousMode != mode && (previousMode == Mode.C40 ||
previousMode == Mode.TEXT ||
previousMode == Mode.X12)) {
size += 2; //unlatch 254 to ASCII followed by latch to this mode
}
break;
case EDF:
size += 3;
if (previousMode == Mode.ASCII || previousMode == Mode.B256) {
size++; //additional byte for latch from ASCII to this mode
} else if (previousMode == Mode.C40 ||
previousMode == Mode.TEXT ||
previousMode == Mode.X12) {
size += 2; //unlatch 254 to ASCII followed by latch to this mode
}
break;
}
cachedTotalSize = size;
}
// does not count beyond 250
int getB256Size() {
int cnt = 0;
Edge current = this;
while (current != null && current.mode == Mode.B256 && cnt <= 250) {
cnt++;
current = current.previous;
}
return cnt;
}
Mode getPreviousStartMode() {
return previous == null ? Mode.ASCII : previous.mode;
}
Mode getPreviousMode() {
return previous == null ? Mode.ASCII : previous.getEndMode();
}
/** Returns Mode.ASCII in case that:
* - Mode is EDIFACT and characterLength is less than 4 or the remaining characters can be encoded in at most 2
* ASCII bytes.
* - Mode is C40, TEXT or X12 and the remaining characters can be encoded in at most 1 ASCII byte.
* Returns mode in all other cases.
* */
Mode getEndMode() {
if (mode == Mode.EDF) {
if (characterLength < 4) {
return Mode.ASCII;
}
int lastASCII = getLastASCII(); // see 5.2.8.2 EDIFACT encodation Rules
if (lastASCII > 0 && getCodewordsRemaining(cachedTotalSize + lastASCII) <= 2 - lastASCII) {
return Mode.ASCII;
}
}
if (mode == Mode.C40 ||
mode == Mode.TEXT ||
mode == Mode.X12) {
// see 5.2.5.2 C40 encodation rules and 5.2.7.2 ANSI X12 encodation rules
if (fromPosition + characterLength >= input.length() && getCodewordsRemaining(cachedTotalSize) == 0) {
return Mode.ASCII;
}
int lastASCII = getLastASCII();
if (lastASCII == 1 && getCodewordsRemaining(cachedTotalSize + 1) == 0) {
return Mode.ASCII;
}
}
return mode;
}
Mode getMode() {
return mode;
}
/** Peeks ahead and returns 1 if the postfix consists of exactly two digits, 2 if the postfix consists of exactly
* two consecutive digits and a non extended character or of 4 digits.
* Returns 0 in any other case
**/
int getLastASCII() {
int length = input.length();
int from = fromPosition + characterLength;
if (length - from > 4 || from >= length) {
return 0;
}
if (length - from == 1) {
if (isExtendedASCII(input.charAt(from), input.getFNC1Character())) {
return 0;
}
return 1;
}
if (length - from == 2) {
if (isExtendedASCII(input.charAt(from), input.getFNC1Character()) || isExtendedASCII(input.charAt(from + 1),
input.getFNC1Character())) {
return 0;
}
if (HighLevelEncoder.isDigit(input.charAt(from)) && HighLevelEncoder.isDigit(input.charAt(from + 1))) {
return 1;
}
return 2;
}
if (length - from == 3) {
if (HighLevelEncoder.isDigit(input.charAt(from)) && HighLevelEncoder.isDigit(input.charAt(from + 1))
&& !isExtendedASCII(input.charAt(from + 2), input.getFNC1Character())) {
return 2;
}
if (HighLevelEncoder.isDigit(input.charAt(from + 1)) && HighLevelEncoder.isDigit(input.charAt(from + 2))
&& !isExtendedASCII(input.charAt(from), input.getFNC1Character())) {
return 2;
}
return 0;
}
if (HighLevelEncoder.isDigit(input.charAt(from)) && HighLevelEncoder.isDigit(input.charAt(from + 1))
&& HighLevelEncoder.isDigit(input.charAt(from + 2)) && HighLevelEncoder.isDigit(input.charAt(from + 3))) {
return 2;
}
return 0;
}
/** Returns the capacity in codewords of the smallest symbol that has enough capacity to fit the given minimal
* number of codewords.
**/
int getMinSymbolSize(int minimum) {
switch (input.getShapeHint()) {
case FORCE_SQUARE:
for (int capacity : squareCodewordCapacities) {
if (capacity >= minimum) {
return capacity;
}
}
break;
case FORCE_RECTANGLE:
for (int capacity : rectangularCodewordCapacities) {
if (capacity >= minimum) {
return capacity;
}
}
break;
}
for (int capacity : allCodewordCapacities) {
if (capacity >= minimum) {
return capacity;
}
}
return allCodewordCapacities[allCodewordCapacities.length - 1];
}
/** Returns the remaining capacity in codewords of the smallest symbol that has enough capacity to fit the given
* minimal number of codewords.
**/
int getCodewordsRemaining(int minimum) {
return getMinSymbolSize(minimum) - minimum;
}
static byte[] getBytes(int c) {
byte[] result = new byte[1];
result[0] = (byte) c;
return result;
}
static byte[] getBytes(int c1,int c2) {
byte[] result = new byte[2];
result[0] = (byte) c1;
result[1] = (byte) c2;
return result;
}
static void setC40Word(byte[] bytes, int offset, int c1, int c2, int c3) {
int val16 = (1600 * (c1 & 0xff)) + (40 * (c2 & 0xff)) + (c3 & 0xff) + 1;
bytes[offset] = (byte) (val16 / 256);
bytes[offset + 1] = (byte) (val16 % 256);
}
private static int getX12Value(char c) {
return c == 13 ? 0 :
c == 42 ? 1 :
c == 62 ? 2 :
c == 32 ? 3 :
c >= 48 && c <= 57 ? c - 44 :
c >= 65 && c <= 90 ? c - 51 : c;
}
byte[] getX12Words() {
assert characterLength % 3 == 0;
byte[] result = new byte[characterLength / 3 * 2];
for (int i = 0; i < result.length; i += 2) {
setC40Word(result,i,getX12Value(input.charAt(fromPosition + i / 2 * 3)),
getX12Value(input.charAt(fromPosition + i / 2 * 3 + 1)),
getX12Value(input.charAt(fromPosition + i / 2 * 3 + 2)));
}
return result;
}
static int getShiftValue(char c, boolean c40, int fnc1) {
return (c40 && isInC40Shift1Set(c) ||
!c40 && isInTextShift1Set(c)) ? 0 :
(c40 && isInC40Shift2Set(c, fnc1) ||
!c40 && isInTextShift2Set(c, fnc1)) ? 1 : 2;
}
private static int getC40Value(boolean c40, int setIndex, char c, int fnc1) {
if (c == fnc1) {
assert setIndex == 2;
return 27;
}
if (c40) {
return c <= 31 ? c :
c == 32 ? 3 :
c <= 47 ? c - 33 :
c <= 57 ? c - 44 :
c <= 64 ? c - 43 :
c <= 90 ? c - 51 :
c <= 95 ? c - 69 :
c <= 127 ? c - 96 : c;
} else {
return c == 0 ? 0 :
setIndex == 0 && c <= 3 ? c - 1 : //is this a bug in the spec?
setIndex == 1 && c <= 31 ? c :
c == 32 ? 3 :
c >= 33 && c <= 47 ? c - 33 :
c >= 48 && c <= 57 ? c - 44 :
c >= 58 && c <= 64 ? c - 43 :
c >= 65 && c <= 90 ? c - 64 :
c >= 91 && c <= 95 ? c - 69 :
c == 96 ? 0 :
c >= 97 && c <= 122 ? c - 83 :
c >= 123 && c <= 127 ? c - 96 : c;
}
}
byte[] getC40Words(boolean c40, int fnc1) {
List<Byte> c40Values = new ArrayList<>();
for (int i = 0; i < characterLength; i++) {
char ci = input.charAt(fromPosition + i);
if (c40 && HighLevelEncoder.isNativeC40(ci) || !c40 && HighLevelEncoder.isNativeText(ci)) {
c40Values.add((byte) getC40Value(c40, 0, ci, fnc1));
} else if (!isExtendedASCII(ci, fnc1)) {
int shiftValue = getShiftValue(ci, c40, fnc1);
c40Values.add((byte) shiftValue); //Shift[123]
c40Values.add((byte) getC40Value(c40, shiftValue, ci, fnc1));
} else {
char asciiValue = (char) ((ci & 0xff) - 128);
if (c40 && HighLevelEncoder.isNativeC40(asciiValue) ||
!c40 && HighLevelEncoder.isNativeText(asciiValue)) {
c40Values.add((byte) 1); //Shift 2
c40Values.add((byte) 30); //Upper Shift
c40Values.add((byte) getC40Value(c40, 0, asciiValue, fnc1));
} else {
c40Values.add((byte) 1); //Shift 2
c40Values.add((byte) 30); //Upper Shift
int shiftValue = getShiftValue(asciiValue, c40, fnc1);
c40Values.add((byte) shiftValue); // Shift[123]
c40Values.add((byte) getC40Value(c40, shiftValue, asciiValue, fnc1));
}
}
}
if ((c40Values.size() % 3) != 0) {
assert (c40Values.size() - 2) % 3 == 0 && fromPosition + characterLength == input.length();
c40Values.add((byte) 0); // pad with 0 (Shift 1)
}
byte[] result = new byte[c40Values.size() / 3 * 2];
int byteIndex = 0;
for (int i = 0; i < c40Values.size(); i += 3) {
setC40Word(result,byteIndex, c40Values.get(i) & 0xff, c40Values.get(i + 1) & 0xff, c40Values.get(i + 2) & 0xff);
byteIndex += 2;
}
return result;
}
byte[] getEDFBytes() {
int numberOfThirds = (int) Math.ceil(characterLength / 4.0);
byte[] result = new byte[numberOfThirds * 3];
int pos = fromPosition;
int endPos = Math.min(fromPosition + characterLength - 1 , input.length() - 1);
for (int i = 0; i < numberOfThirds; i += 3) {
int[] edfValues = new int[4];
for (int j = 0; j < 4; j++) {
if (pos <= endPos) {
edfValues[j] = input.charAt(pos++) & 0x3f;
} else {
edfValues[j] = pos == endPos + 1 ? 0x1f : 0;
}
}
int val24 = edfValues[0] << 18;
val24 |= edfValues[1] << 12;
val24 |= edfValues[2] << 6;
val24 |= edfValues[3];
result[i] = (byte) ((val24 >> 16) & 0xff);
result[i + 1] = (byte) ((val24 >> 8) & 0xff);
result[i + 2] = (byte) (val24 & 0xff);
}
return result;
}
byte[] getLatchBytes() {
switch (getPreviousMode()) {
case ASCII:
case B256: //after B256 ends (via length) we are back to ASCII
switch (mode) {
case B256:
return getBytes(231);
case C40:
return getBytes(230);
case TEXT:
return getBytes(239);
case X12:
return getBytes(238);
case EDF:
return getBytes(240);
}
break;
case C40:
case TEXT:
case X12:
if (mode != getPreviousMode()) {
switch (mode) {
case ASCII:
return getBytes(254);
case B256:
return getBytes(254, 231);
case C40:
return getBytes(254, 230);
case TEXT:
return getBytes(254, 239);
case X12:
return getBytes(254, 238);
case EDF:
return getBytes(254, 240);
}
}
break;
case EDF:
assert mode == Mode.EDF; //The rightmost EDIFACT edge always contains an unlatch character
break;
}
return new byte[0];
}
// Important: The function does not return the length bytes (one or two) in case of B256 encoding
byte[] getDataBytes() {
switch (mode) {
case ASCII:
if (input.isECI(fromPosition)) {
return getBytes(241,input.getECIValue(fromPosition) + 1);
} else if (isExtendedASCII(input.charAt(fromPosition), input.getFNC1Character())) {
return getBytes(235,input.charAt(fromPosition) - 127);
} else if (characterLength == 2) {
return getBytes((input.charAt(fromPosition) - '0') * 10 + input.charAt(fromPosition + 1) - '0' + 130);
} else if (input.isFNC1(fromPosition)) {
return getBytes(232);
} else {
return getBytes(input.charAt(fromPosition) + 1);
}
case B256:
return getBytes(input.charAt(fromPosition));
case C40:
return getC40Words(true, input.getFNC1Character());
case TEXT:
return getC40Words(false, input.getFNC1Character());
case X12:
return getX12Words();
case EDF:
return getEDFBytes();
}
assert false;
return new byte[0];
}
}
private static final class Result {
private final byte[] bytes;
Result(Edge solution) {
Input input = solution.input;
int size = 0;
List<Byte> bytesAL = new ArrayList<>();
List<Integer> randomizePostfixLength = new ArrayList<>();
List<Integer> randomizeLengths = new ArrayList<>();
if ((solution.mode == Mode.C40 ||
solution.mode == Mode.TEXT ||
solution.mode == Mode.X12) &&
solution.getEndMode() != Mode.ASCII) {
size += prepend(MinimalEncoder.Edge.getBytes(254),bytesAL);
}
Edge current = solution;
while (current != null) {
size += prepend(current.getDataBytes(),bytesAL);
if (current.previous == null || current.getPreviousStartMode() != current.getMode()) {
if (current.getMode() == Mode.B256) {
if (size <= 249) {
bytesAL.add(0, (byte) size);
size++;
} else {
bytesAL.add(0, (byte) (size % 250));
bytesAL.add(0, (byte) (size / 250 + 249));
size += 2;
}
randomizePostfixLength.add(bytesAL.size());
randomizeLengths.add(size);
}
prepend(current.getLatchBytes(), bytesAL);
size = 0;
}
current = current.previous;
}
if (input.getMacroId() == 5) {
size += prepend(MinimalEncoder.Edge.getBytes(236), bytesAL);
} else if (input.getMacroId() == 6) {
size += prepend(MinimalEncoder.Edge.getBytes(237), bytesAL);
}
if (input.getFNC1Character() > 0) {
size += prepend(MinimalEncoder.Edge.getBytes(232), bytesAL);
}
for (int i = 0; i < randomizePostfixLength.size(); i++) {
applyRandomPattern(bytesAL,bytesAL.size() - randomizePostfixLength.get(i), randomizeLengths.get(i));
}
//add padding
int capacity = solution.getMinSymbolSize(bytesAL.size());
if (bytesAL.size() < capacity) {
bytesAL.add((byte) 129);
}
while (bytesAL.size() < capacity) {
bytesAL.add((byte) randomize253State(bytesAL.size() + 1));
}
bytes = new byte[bytesAL.size()];
for (int i = 0; i < bytes.length; i++) {
bytes[i] = bytesAL.get(i);
}
}
static int prepend(byte[] bytes, List<Byte> into) {
for (int i = bytes.length - 1; i >= 0; i--) {
into.add(0, bytes[i]);
}
return bytes.length;
}
private static int randomize253State(int codewordPosition) {
int pseudoRandom = ((149 * codewordPosition) % 253) + 1;
int tempVariable = 129 + pseudoRandom;
return tempVariable <= 254 ? tempVariable : tempVariable - 254;
}
static void applyRandomPattern(List<Byte> bytesAL,int startPosition, int length) {
for (int i = 0; i < length; i++) {
//See "B.1 253-state algorithm
int Pad_codeword_position = startPosition + i;
int Pad_codeword_value = bytesAL.get(Pad_codeword_position) & 0xff;
int pseudo_random_number = ((149 * (Pad_codeword_position + 1)) % 255) + 1;
int temp_variable = Pad_codeword_value + pseudo_random_number;
bytesAL.set(Pad_codeword_position, (byte) (temp_variable <= 255 ? temp_variable : temp_variable - 256));
}
}
public byte[] getBytes() {
return bytes;
}
}
private static final class Input extends MinimalECIInput {
private final SymbolShapeHint shape;
private final int macroId;
private Input(String stringToEncode, Charset priorityCharset, int fnc1, SymbolShapeHint shape, int macroId) {
super(stringToEncode, priorityCharset, fnc1);
this.shape = shape;
this.macroId = macroId;
}
private int getMacroId() {
return macroId;
}
private SymbolShapeHint getShapeHint() {
return shape;
}
}
}