ccutil/boxread.cpp - platform/external/tesseract - Git at Google

 /**********************************************************************
  * File:        boxread.cpp
  * Description: Read data from a box file.
  * Author:      Ray Smith
  * Created:     Fri Aug 24 17:47:23 PDT 2007
  *
  * (C) Copyright 2007, Google Inc.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #include "mfcpch.h"
 #include <string.h>
 #include "boxread.h"
 #include "unichar.h"
 #include "tprintf.h"

 // Box files are needed ONLY DURING TRAINING, yet two separate tools consume
 // them: tesseract (when creating tr files) and unicharset_extractor.
 // read_next_box centralises the parsing of a single box-file line so that
 // applybox and unicharset_extractor agree on the interpretation.
 //
 // Fetches the next valid box: its utf8 string is copied to utf8_str and its
 // coordinates are stored through x_min, y_min, x_max and y_max.
 // Returns true when a box was read, or false on eof (in which case the
 // file has also been closed).
 // The utf8 file signature is skipped, the string is checked for valid utf-8,
 // and fields may be separated by spaces or tabs.
 // utf8_str must be at least kBoxReadBufSize in length.
 // Boxes are returned regardless of any page number present in the file.
 bool read_next_box(FILE* box_file, char* utf8_str,
                    int* x_min, int* y_min, int* x_max, int* y_max) {
   // A negative target page asks the paged overload to accept every page.
   const int any_page = -1;
   const bool got_box = read_next_box(any_page, box_file, utf8_str,
                                      x_min, y_min, x_max, y_max);
   return got_box;
 }

 // As read_next_box above, but only returns boxes that belong to target_page
 // (0-based). Pass a negative value (conventionally -1) to accept boxes from
 // any page. Lines that carry no page number field are treated as page 0.
 //
 // Each line is parsed as "<utf8> <x_min> <y_min> <x_max> <y_max> [<page>]".
 // On success the utf8 string is copied to utf8_str (which must be at least
 // kBoxReadBufSize in length), the coordinates are stored through the int
 // pointers, and true is returned. Lines that are malformed, contain invalid
 // utf-8, or whose string is longer than UNICHAR_LEN are reported with tprintf
 // and skipped. The coordinate outputs may be overwritten while skipped lines
 // are being parsed.
 // On eof the file is closed, the line counter is reset and false is returned.
 //
 // NOTE: the line counter used in diagnostics is a function-level static, so
 // it is shared by every caller and only restarts once a file reaches eof.
 bool read_next_box(int target_page, FILE* box_file, char* utf8_str,
                    int* x_min, int* y_min, int* x_max, int* y_max) {
   static int line = 0;
   int count = 0;
   int page = 0;
   char buff[kBoxReadBufSize];                // boxfile read buffer
   char uch[kBoxReadBufSize];                 // utf8 string parsed from buff
   char *buffptr = buff;

   while (fgets(buff, sizeof(buff) - 1, box_file)) {
     line++;

     buffptr = buff;
     const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
     // The && chain stops at the first mismatch (including a terminating
     // NUL), so no byte beyond the end of the string is examined.
     if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
       buffptr += 3;  // Skip unicode file designation.
     // Skip leading blanks; a line with nothing else left is ignored silently.
     while (*buffptr == ' ' || *buffptr == '\t')
       buffptr++;
     if (*buffptr == '\0')
       continue;

     // Try the 6-field form (with page number) first, then fall back to the
     // 5-field form, which implies page 0.
     count = sscanf(buffptr, "%s %d %d %d %d %d",
                    uch, x_min, y_min, x_max, y_max, &page);
     if (count != 6) {
       page = 0;
       count = sscanf(buffptr, "%s %d %d %d %d",
                      uch, x_min, y_min, x_max, y_max);
     }
     if (target_page >= 0 && target_page != page)
       continue;  // Not on the appropriate page.

     // Validate both the 5-field and the 6-field form. (Testing for exactly
     // 5 here would let lines with a page number bypass validation.)
     if (count >= 5) {
       // Validate UTF8 by making unichars with it.
       int used = 0;
       int uch_len = static_cast<int>(strlen(uch));
       while (used < uch_len) {
         UNICHAR ch(uch + used, uch_len - used);
         int new_used = ch.utf8_len();
         if (new_used == 0) {
           // Go through unsigned char so bytes >= 0x80 are not sign-extended,
           // and pass line before column to match the format string.
           tprintf("Bad utf-8 char starting with 0x%x at line %d, col %d, \n",
                   static_cast<unsigned int>(
                       static_cast<unsigned char>(uch[used])),
                   line, used + 1);
           count = 0;
           break;
         }
         used += new_used;
       }
       if (uch_len > UNICHAR_LEN) {
         tprintf("utf-8 string too long at line %d\n", line);
         count = 0;
       }
     }
     if (count < 5) {
       tprintf("Box file format error on line %i ignored\n", line);
     } else {
       strcpy(utf8_str, uch);
       return true;             // read a box ok
     }
   }
   fclose(box_file);
   line = 0;
   return false;                  // EOF
 }
	/**********************************************************************
	* File: boxread.cpp
	* Description: Read data from a box file.
	* Author: Ray Smith
	* Created: Fri Aug 24 17:47:23 PDT 2007
	*
	* (C) Copyright 2007, Google Inc.
	** Licensed under the Apache License, Version 2.0 (the "License");
	** you may not use this file except in compliance with the License.
	** You may obtain a copy of the License at
	** http://www.apache.org/licenses/LICENSE-2.0
	** Unless required by applicable law or agreed to in writing, software
	** distributed under the License is distributed on an "AS IS" BASIS,
	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	** See the License for the specific language governing permissions and
	** limitations under the License.
	*
	**********************************************************************/

	#include "mfcpch.h"
	#include <string.h>
	#include "boxread.h"
	#include "unichar.h"
	#include "tprintf.h"

	// Box files are needed ONLY DURING TRAINING, yet two separate tools consume
	// them: tesseract (when creating tr files) and unicharset_extractor.
	// read_next_box centralises the parsing of a single box-file line so that
	// applybox and unicharset_extractor agree on the interpretation.
	//
	// Fetches the next valid box: its utf8 string is copied to utf8_str and its
	// coordinates are stored through x_min, y_min, x_max and y_max.
	// Returns true when a box was read, or false on eof (in which case the
	// file has also been closed).
	// The utf8 file signature is skipped, the string is checked for valid utf-8,
	// and fields may be separated by spaces or tabs.
	// utf8_str must be at least kBoxReadBufSize in length.
	// Boxes are returned regardless of any page number present in the file.
	bool read_next_box(FILE* box_file, char* utf8_str,
	                   int* x_min, int* y_min, int* x_max, int* y_max) {
	  // A negative target page asks the paged overload to accept every page.
	  const int any_page = -1;
	  const bool got_box = read_next_box(any_page, box_file, utf8_str,
	                                     x_min, y_min, x_max, y_max);
	  return got_box;
	}

	// As read_next_box above, but only returns boxes that belong to target_page
	// (0-based). Pass a negative value (conventionally -1) to accept boxes from
	// any page. Lines that carry no page number field are treated as page 0.
	//
	// Each line is parsed as "<utf8> <x_min> <y_min> <x_max> <y_max> [<page>]".
	// On success the utf8 string is copied to utf8_str (which must be at least
	// kBoxReadBufSize in length), the coordinates are stored through the int
	// pointers, and true is returned. Lines that are malformed, contain invalid
	// utf-8, or whose string is longer than UNICHAR_LEN are reported with tprintf
	// and skipped. The coordinate outputs may be overwritten while skipped lines
	// are being parsed.
	// On eof the file is closed, the line counter is reset and false is returned.
	//
	// NOTE: the line counter used in diagnostics is a function-level static, so
	// it is shared by every caller and only restarts once a file reaches eof.
	bool read_next_box(int target_page, FILE* box_file, char* utf8_str,
	                   int* x_min, int* y_min, int* x_max, int* y_max) {
	  static int line = 0;
	  int count = 0;
	  int page = 0;
	  char buff[kBoxReadBufSize];  // boxfile read buffer
	  char uch[kBoxReadBufSize];   // utf8 string parsed from buff
	  char *buffptr = buff;

	  while (fgets(buff, sizeof(buff) - 1, box_file)) {
	    line++;

	    buffptr = buff;
	    // View the line as unsigned bytes so the signature comparison below
	    // works regardless of the signedness of plain char.
	    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
	    // The && chain stops at the first mismatch (including a terminating
	    // NUL), so no byte beyond the end of the string is examined.
	    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
	      buffptr += 3;  // Skip unicode file designation.
	    // Skip leading blanks; a line with nothing else left is ignored silently.
	    while (*buffptr == ' ' || *buffptr == '\t')
	      buffptr++;
	    if (*buffptr == '\0')
	      continue;

	    // Try the 6-field form (with page number) first, then fall back to the
	    // 5-field form, which implies page 0.
	    count = sscanf(buffptr, "%s %d %d %d %d %d",
	                   uch, x_min, y_min, x_max, y_max, &page);
	    if (count != 6) {
	      page = 0;
	      count = sscanf(buffptr, "%s %d %d %d %d",
	                     uch, x_min, y_min, x_max, y_max);
	    }
	    if (target_page >= 0 && target_page != page)
	      continue;  // Not on the appropriate page.

	    // Validate both the 5-field and the 6-field form. (Testing for exactly
	    // 5 here would let lines with a page number bypass validation.)
	    if (count >= 5) {
	      // Validate UTF8 by making unichars with it.
	      int used = 0;
	      int uch_len = static_cast<int>(strlen(uch));
	      while (used < uch_len) {
	        UNICHAR ch(uch + used, uch_len - used);
	        int new_used = ch.utf8_len();
	        if (new_used == 0) {
	          // Go through unsigned char so bytes >= 0x80 are not sign-extended,
	          // and pass line before column to match the format string.
	          tprintf("Bad utf-8 char starting with 0x%x at line %d, col %d, \n",
	                  static_cast<unsigned int>(
	                      static_cast<unsigned char>(uch[used])),
	                  line, used + 1);
	          count = 0;
	          break;
	        }
	        used += new_used;
	      }
	      if (uch_len > UNICHAR_LEN) {
	        tprintf("utf-8 string too long at line %d\n", line);
	        count = 0;
	      }
	    }
	    if (count < 5) {
	      tprintf("Box file format error on line %i ignored\n", line);
	    } else {
	      strcpy(utf8_str, uch);
	      return true;  // read a box ok
	    }
	  }
	  fclose(box_file);
	  line = 0;
	  return false;  // EOF
	}