ccutil/unicharmap.cpp - platform/external/tesseract - Git at Google

 ///////////////////////////////////////////////////////////////////////
 // File:        unicharmap.cpp
 // Description: Unicode character/ligature to integer id class.
 // Author:      Thomas Kielbus
 // Created:     Wed Jun 28 17:05:01 PDT 2006
 //
 // (C) Copyright 2006, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 ///////////////////////////////////////////////////////////////////////

 #include <assert.h>
 #include "unichar.h"
 #include "host.h"
 #include "unicharmap.h"

 // Builds an empty map: the pointer to the root node array starts out null.
 UNICHARMAP::UNICHARMAP() : nodes(0) {}

 // Frees the root node array. delete[] on a null pointer does nothing, so an
 // empty map needs no special handling.
 UNICHARMAP::~UNICHARMAP() {
   delete[] nodes;
 }

 // Looks up the id stored for unichar_repr by walking the tree one byte at a
 // time; each byte, read as an unsigned char, indexes the current node array.
 // Returns the id held by the entry reached with the last byte, which is -1
 // (the value a node is constructed with) when nothing was inserted there.
 // Also returns -1, rather than dereferencing a null pointer, when the tree
 // has no node array somewhere along the path, or when unichar_repr is empty
 // and asserts are compiled out.
 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
   assert(*unichar_repr != '\0');
   // Without this guard an empty string would make the loop below inspect the
   // byte after the terminator.
   if (*unichar_repr == '\0')
     return -1;

   const char* current_char = unichar_repr;
   const UNICHARMAP_NODE* current_nodes = nodes;
   while (current_nodes != 0) {
     const UNICHARMAP_NODE& entry =
         current_nodes[static_cast<unsigned char>(*current_char)];
     // The byte just before the terminator selects the entry holding the id.
     if (*(current_char + 1) == '\0')
       return entry.id;
     current_nodes = entry.children;
     ++current_char;
   }
   // The path ended before the string did: the representation is not stored.
   return -1;
 }

 // Looks up the id stored for the first length bytes of unichar_repr (or for
 // the whole string if it is shorter). Each byte, read as an unsigned char,
 // indexes the current node array. Returns the id held by the entry reached
 // with the last byte used, which is -1 (the value a node is constructed with)
 // when nothing was inserted there. Also returns -1, rather than dereferencing
 // a null pointer, when the tree has no node array somewhere along the path,
 // or when unichar_repr is empty and asserts are compiled out.
 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
                                      int length) const {
   assert(*unichar_repr != '\0');
   assert(length > 0 && length <= UNICHAR_LEN);
   // An empty string cannot name any entry.
   if (*unichar_repr == '\0')
     return -1;

   const char* current_char = unichar_repr;
   const UNICHARMAP_NODE* current_nodes = nodes;
   while (current_nodes != 0) {
     const UNICHARMAP_NODE& entry =
         current_nodes[static_cast<unsigned char>(*current_char)];
     // Stop at whichever comes first: the length budget or the end of string.
     if (length == 1 || *(current_char + 1) == '\0')
       return entry.id;
     current_nodes = entry.children;
     ++current_char;
     --length;
   }
   // The path ended before the requested prefix did: nothing is stored for it.
   return -1;
 }

 // Search the given unichar representation in the tree, creating the possibly
 // missing nodes. Once the right place has been found, insert the given id and
 // update the inserted flag to keep track of the insert. Each character in the
 // string is interpreted as an index in an array of nodes.
 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
   const char* current_char = unichar_repr;
   UNICHARMAP_NODE** current_nodes_pointer = &nodes;

   assert(*unichar_repr != '\0');
   assert(id >= 0);

   do {
     if (*current_nodes_pointer == 0)
       *current_nodes_pointer = new UNICHARMAP_NODE[256];
     if (*(current_char + 1) == '\0') {
       (*current_nodes_pointer)
           [static_cast<unsigned char>(*current_char)].id = id;
       return;
     }
     current_nodes_pointer =
         &((*current_nodes_pointer)
           [static_cast<unsigned char>(*current_char)].children);
     ++current_char;
   } while (true);
 }

 // Search the given unichar representation in the tree. Each character in the
 // string is interpreted as an index in an array of nodes. Stop once the tree
 // does not have anymore nodes or once we found the right unichar_repr.
 bool UNICHARMAP::contains(const char* const unichar_repr) const {
   const char* current_char = unichar_repr;
   UNICHARMAP_NODE* current_nodes = nodes;

   assert(*unichar_repr != '\0');

   while (current_nodes != 0 && *(current_char + 1) != '\0') {
     current_nodes =
         current_nodes[static_cast<unsigned char>(*current_char)].children;
     ++current_char;
   }
   return current_nodes != 0 && *(current_char + 1) == '\0' &&
       current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
 }

 // Search the given unichar representation in the tree, using length characters
 // from it maximum. Each character in the string is interpreted as an index in
 // an array of nodes. Stop once the tree does not have anymore nodes or once we
 // found the right unichar_repr.
 bool UNICHARMAP::contains(const char* const unichar_repr,
                           int length) const {
   const char* current_char = unichar_repr;
   UNICHARMAP_NODE* current_nodes = nodes;

   assert(*unichar_repr != '\0');
   assert(length > 0 && length <= UNICHAR_LEN);

   while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
     current_nodes =
         current_nodes[static_cast<unsigned char>(*current_char)].children;
     --length;
     ++current_char;
   }
   return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
       current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
 }

 // Return the minimum number of characters that must be used from this string
 // to obtain a match in the UNICHARMAP.
 int UNICHARMAP::minmatch(const char* const unichar_repr) const {
   const char* current_char = unichar_repr;
   UNICHARMAP_NODE* current_nodes = nodes;

   while (current_nodes != NULL && *current_char != '\0') {
     if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
       return current_char + 1 - unichar_repr;
     current_nodes =
         current_nodes[static_cast<unsigned char>(*current_char)].children;
     ++current_char;
   }
   return 0;
 }

 // Empties the map: frees the root node array and resets the pointer to null.
 // delete[] on a null pointer does nothing, so an already-empty map is fine.
 void UNICHARMAP::clear() {
   delete[] nodes;
   nodes = 0;
 }

 // A fresh node has no children array and carries the id -1; the lookup
 // methods treat only ids >= 0 as present.
 UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(0), id(-1) {}

 // Frees the children array. Destroying that array runs this destructor on
 // every child node, so the whole subtree is released recursively. delete[] on
 // a null pointer does nothing, so leaf nodes need no special handling.
 UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
   delete[] children;
 }
	///////////////////////////////////////////////////////////////////////
	// File: unicharmap.cpp
	// Description: Unicode character/ligature to integer id class.
	// Author: Thomas Kielbus
	// Created: Wed Jun 28 17:05:01 PDT 2006
	//
	// (C) Copyright 2006, Google Inc.
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	// http://www.apache.org/licenses/LICENSE-2.0
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	///////////////////////////////////////////////////////////////////////

	#include <assert.h>
	#include "unichar.h"
	#include "host.h"
	#include "unicharmap.h"

	// Builds an empty map: the pointer to the root node array starts out null.
	UNICHARMAP::UNICHARMAP() : nodes(0) {}

	// Frees the root node array. delete[] on a null pointer does nothing, so an
	// empty map needs no special handling.
	UNICHARMAP::~UNICHARMAP() {
	  delete[] nodes;
	}

	// Looks up the id stored for unichar_repr by walking the tree one byte at a
	// time; each byte, read as an unsigned char, indexes the current node array.
	// Returns the id held by the entry reached with the last byte, which is -1
	// (the value a node is constructed with) when nothing was inserted there.
	// Also returns -1, rather than dereferencing a null pointer, when the tree
	// has no node array somewhere along the path, or when unichar_repr is empty
	// and asserts are compiled out.
	UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
	  assert(*unichar_repr != '\0');
	  // Without this guard an empty string would make the loop below inspect the
	  // byte after the terminator.
	  if (*unichar_repr == '\0')
	    return -1;

	  const char* current_char = unichar_repr;
	  const UNICHARMAP_NODE* current_nodes = nodes;
	  while (current_nodes != 0) {
	    const UNICHARMAP_NODE& entry =
	        current_nodes[static_cast<unsigned char>(*current_char)];
	    // The byte just before the terminator selects the entry holding the id.
	    if (*(current_char + 1) == '\0')
	      return entry.id;
	    current_nodes = entry.children;
	    ++current_char;
	  }
	  // The path ended before the string did: the representation is not stored.
	  return -1;
	}

	// Looks up the id stored for the first length bytes of unichar_repr (or for
	// the whole string if it is shorter). Each byte, read as an unsigned char,
	// indexes the current node array. Returns the id held by the entry reached
	// with the last byte used, which is -1 (the value a node is constructed with)
	// when nothing was inserted there. Also returns -1, rather than dereferencing
	// a null pointer, when the tree has no node array somewhere along the path,
	// or when unichar_repr is empty and asserts are compiled out.
	UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
	                                     int length) const {
	  assert(*unichar_repr != '\0');
	  assert(length > 0 && length <= UNICHAR_LEN);
	  // An empty string cannot name any entry.
	  if (*unichar_repr == '\0')
	    return -1;

	  const char* current_char = unichar_repr;
	  const UNICHARMAP_NODE* current_nodes = nodes;
	  while (current_nodes != 0) {
	    const UNICHARMAP_NODE& entry =
	        current_nodes[static_cast<unsigned char>(*current_char)];
	    // Stop at whichever comes first: the length budget or the end of string.
	    if (length == 1 || *(current_char + 1) == '\0')
	      return entry.id;
	    current_nodes = entry.children;
	    ++current_char;
	    --length;
	  }
	  // The path ended before the requested prefix did: nothing is stored for it.
	  return -1;
	}

	// Search the given unichar representation in the tree, creating the possibly
	// missing nodes. Once the right place has been found, insert the given id and
	// update the inserted flag to keep track of the insert. Each character in the
	// string is interpreted as an index in an array of nodes.
	void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
	const char* current_char = unichar_repr;
	UNICHARMAP_NODE** current_nodes_pointer = &nodes;

	assert(*unichar_repr != '\0');
	assert(id >= 0);

	do {
	if (*current_nodes_pointer == 0)
	*current_nodes_pointer = new UNICHARMAP_NODE[256];
	if (*(current_char + 1) == '\0') {
	(*current_nodes_pointer)
	[static_cast<unsigned char>(*current_char)].id = id;
	return;
	}
	current_nodes_pointer =
	&((*current_nodes_pointer)
	[static_cast<unsigned char>(*current_char)].children);
	++current_char;
	} while (true);
	}

	// Search the given unichar representation in the tree. Each character in the
	// string is interpreted as an index in an array of nodes. Stop once the tree
	// does not have anymore nodes or once we found the right unichar_repr.
	bool UNICHARMAP::contains(const char* const unichar_repr) const {
	const char* current_char = unichar_repr;
	UNICHARMAP_NODE* current_nodes = nodes;

	assert(*unichar_repr != '\0');

	while (current_nodes != 0 && *(current_char + 1) != '\0') {
	current_nodes =
	current_nodes[static_cast<unsigned char>(*current_char)].children;
	++current_char;
	}
	return current_nodes != 0 && *(current_char + 1) == '\0' &&
	current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
	}

	// Search the given unichar representation in the tree, using length characters
	// from it maximum. Each character in the string is interpreted as an index in
	// an array of nodes. Stop once the tree does not have anymore nodes or once we
	// found the right unichar_repr.
	bool UNICHARMAP::contains(const char* const unichar_repr,
	int length) const {
	const char* current_char = unichar_repr;
	UNICHARMAP_NODE* current_nodes = nodes;

	assert(*unichar_repr != '\0');
	assert(length > 0 && length <= UNICHAR_LEN);

	while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
	current_nodes =
	current_nodes[static_cast<unsigned char>(*current_char)].children;
	--length;
	++current_char;
	}
	return current_nodes != 0 && (length == 1 \|\| *(current_char + 1) == '\0') &&
	current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
	}

	// Return the minimum number of characters that must be used from this string
	// to obtain a match in the UNICHARMAP.
	int UNICHARMAP::minmatch(const char* const unichar_repr) const {
	const char* current_char = unichar_repr;
	UNICHARMAP_NODE* current_nodes = nodes;

	while (current_nodes != NULL && *current_char != '\0') {
	if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
	return current_char + 1 - unichar_repr;
	current_nodes =
	current_nodes[static_cast<unsigned char>(*current_char)].children;
	++current_char;
	}
	return 0;
	}

	// Empties the map: frees the root node array and resets the pointer to null.
	// delete[] on a null pointer does nothing, so an already-empty map is fine.
	void UNICHARMAP::clear() {
	  delete[] nodes;
	  nodes = 0;
	}

	// A fresh node has no children array and carries the id -1; the lookup
	// methods treat only ids >= 0 as present.
	UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(0), id(-1) {}

	// Frees the children array. Destroying that array runs this destructor on
	// every child node, so the whole subtree is released recursively. delete[] on
	// a null pointer does nothing, so leaf nodes need no special handling.
	UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
	  delete[] children;
	}