blob: 0d927757ee9a7892b8dd26649d5c60d44fe9fec6 [file] [log] [blame]
/*====================================================================*
- Copyright (C) 2001 Leptonica. All rights reserved.
- This software is distributed in the hope that it will be
- useful, but with NO WARRANTY OF ANY KIND.
- No author or distributor accepts responsibility to anyone for the
- consequences of using this software, or for whether it serves any
- particular purpose or works at all, unless he or she says so in
- writing. Everyone is granted permission to copy, modify and
- redistribute this source code, for commercial or non-commercial
- purposes, with the following restrictions: (1) the origin of this
- source code must not be misrepresented; (2) modified versions must
- be plainly marked as such; and (3) this notice may not be removed
- or altered from any source or modified source distribution.
*====================================================================*/
/*
* sarray.c
*
* Create/Destroy/Copy
* SARRAY *sarrayCreate()
* SARRAY *sarrayCreateWordsFromString()
* SARRAY *sarrayCreateLinesFromString()
* void *sarrayDestroy()
* SARRAY *sarrayCopy()
* SARRAY *sarrayClone()
*
* Add/Remove string
* l_int32 sarrayAddString()
* l_int32 sarrayExtendArray()
* char *sarrayRemoveString()
* l_int32 sarrayClear()
*
* Accessors
* l_int32 sarrayGetCount()
* char **sarrayGetArray()
* char *sarrayGetString()
* l_int32 sarrayGetRefcount()
* l_int32 sarrayChangeRefcount()
*
* Conversion back to string
* char *sarrayToString()
* char *sarrayToStringRange()
*
* Concatenate 2 sarrays
* l_int32 sarrayConcatenate()
* l_int32 sarrayAppendRange()
*
* Convert word sarray to (formatted) line sarray
* SARRAY *sarrayConvertWordsToLines()
*
* Split string on separator list
* SARRAY *sarraySplitString()
*
* Filter sarray
* SARRAY *sarraySelectBySubstring()
* l_int32 sarrayParseRange()
*
* Sort
* SARRAY *sarraySort()
* l_int32 stringCompareLexical()
*
* Serialize for I/O
* SARRAY *sarrayRead()
* SARRAY *sarrayReadStream()
* l_int32 sarrayWrite()
* l_int32 sarrayWriteStream()
* l_int32 sarrayAppend()
*
* Directory filenames
* SARRAY *getSortedPathnamesInDirectory()
* SARRAY *getFilenamesInDirectory()
*
*
* Comments on usage:
*
* These functions are important for efficient manipulation
* of string data. They have been used in leptonica for
* generating and parsing text files, and for generating
* code for compilation. The user is responsible for
* correctly disposing of strings that have been extracted
* from sarrays.
*
* - When you want a string from an Sarray to inspect it, or
* plan to make a copy of it later, use sarrayGetString()
* with copyflag = 0. In this case, you must neither free
* the string nor put it directly in another array.
* We provide the copyflag constant L_NOCOPY, which is 0,
* for this purpose:
* str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
* To extract a copy of a string, use:
* str-owned = sarrayGetString(sa, index, L_COPY);
*
* - When you want to insert a string that is in one
* array into another array (always leaving the first
* array intact), you have two options:
* (1) use copyflag = L_COPY to make an immediate copy,
* which you must then add to the second array
* by insertion; namely,
* str-owned = sarrayGetString(sa, index, L_COPY);
* sarrayAddString(sa, str-owned, L_INSERT);
* (2) use copyflag = L_NOCOPY to get another handle to
* the string, in which case you must add
* a copy of it to the second string array:
* str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
* sarrayAddString(sa, str-not-owned, L_COPY).
*
* In all cases, when you use copyflag = L_COPY to extract
* a string from an array, you must either free it
* or insert it in an array that will be freed later.
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#ifndef COMPILER_MSVC
#include <dirent.h> /* unix only */
#endif /* !COMPILER_MSVC */
#include "allheaders.h"
static const l_int32 INITIAL_PTR_ARRAYSIZE = 50; /* n'importe quoi */
static const l_int32 L_BUF_SIZE = 512;
/*--------------------------------------------------------------------------*
* String array create/destroy/copy/extend *
*--------------------------------------------------------------------------*/
/*!
* sarrayCreate()
*
* Input: size of string ptr array to be alloc'd
* (use 0 for default)
* Return: sarray, or null on error
*/
SARRAY *
sarrayCreate(l_int32 n)
{
SARRAY *sa;
PROCNAME("sarrayCreate");
if (n <= 0)
n = INITIAL_PTR_ARRAYSIZE;
if ((sa = (SARRAY *)CALLOC(1, sizeof(SARRAY))) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
if ((sa->array = (char **)CALLOC(n, sizeof(char *))) == NULL)
return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL);
sa->nalloc = n;
sa->n = 0;
sa->refcount = 1;
return sa;
}
/*!
* sarrayCreateWordsFromString()
*
* Input: string
* Return: sarray, or null on error
*
* Notes:
* (1) This finds the number of word substrings, creates an sarray
* of this size, and puts copies of each substring into the sarray.
*/
SARRAY *
sarrayCreateWordsFromString(const char *string)
{
char separators[] = " \n\t";
l_int32 i, nsub, size, inword;
SARRAY *sa;
PROCNAME("sarrayCreateWordsFromString");
if (!string)
return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
/* Find the number of words */
size = strlen(string);
nsub = 0;
inword = FALSE;
for (i = 0; i < size; i++) {
if (inword == FALSE &&
(string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) {
inword = TRUE;
nsub++;
}
else if (inword == TRUE &&
(string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) {
inword = FALSE;
}
}
if ((sa = sarrayCreate(nsub)) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
sarraySplitString(sa, string, separators);
return sa;
}
/*!
* sarrayCreateLinesFromString()
*
* Input: string
* blankflag (0 to exclude blank lines; 1 to include)
* Return: sarray, or null on error
*
* Notes:
* (1) This finds the number of line substrings, creates an sarray of
* this size, and puts copies of each substring into the sarray.
*/
SARRAY *
sarrayCreateLinesFromString(char *string,
l_int32 blankflag)
{
l_int32 i, nsub, size, startptr;
char *cstring, *substring;
SARRAY *sa;
PROCNAME("sarrayCreateLinesFromString");
if (!string)
return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
/* find the number of lines */
size = strlen(string);
nsub = 0;
for (i = 0; i < size; i++) {
if (string[i] == '\n')
nsub++;
}
if ((sa = sarrayCreate(nsub)) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
if (blankflag) { /* keep blank lines as null strings */
/* Make a copy for munging */
if ((cstring = stringNew(string)) == NULL)
return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL);
/* We'll insert nulls like strtok */
startptr = 0;
for (i = 0; i < size; i++) {
if (cstring[i] == '\n') {
cstring[i] = '\0';
if ((substring = stringNew(cstring + startptr)) == NULL)
return (SARRAY *)ERROR_PTR("substring not made",
procName, NULL);
sarrayAddString(sa, substring, L_INSERT);
/* fprintf(stderr, "substring = %s\n", substring); */
startptr = i + 1;
}
}
if (startptr < size) { /* no newline at end of last line */
if ((substring = stringNew(cstring + startptr)) == NULL)
return (SARRAY *)ERROR_PTR("substring not made",
procName, NULL);
sarrayAddString(sa, substring, L_INSERT);
/* fprintf(stderr, "substring = %s\n", substring); */
}
FREE(cstring);
}
else { /* remove blank lines; use strtok */
sarraySplitString(sa, string, "\n");
}
return sa;
}
/*!
* sarrayDestroy()
*
* Input: &sarray <to be nulled>
* Return: void
*
* Notes:
* (1) Decrements the ref count and, if 0, destroys the sarray.
* (2) Always nulls the input ptr.
*/
void
sarrayDestroy(SARRAY **psa)
{
l_int32 i;
SARRAY *sa;
PROCNAME("sarrayDestroy");
if (psa == NULL) {
L_WARNING("ptr address is NULL!", procName);
return;
}
if ((sa = *psa) == NULL)
return;
sarrayChangeRefcount(sa, -1);
if (sarrayGetRefcount(sa) <= 0) {
if (sa->array) {
for (i = 0; i < sa->n; i++)
FREE(sa->array[i]);
FREE(sa->array);
}
FREE(sa);
}
*psa = NULL;
return;
}
/*!
* sarrayCopy()
*
* Input: sarray
* Return: copy of sarray, or null on error
*/
SARRAY *
sarrayCopy(SARRAY *sa)
{
l_int32 i;
SARRAY *csa;
PROCNAME("sarrayCopy");
if (!sa)
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
if ((csa = sarrayCreate(sa->nalloc)) == NULL)
return (SARRAY *)ERROR_PTR("csa not made", procName, NULL);
for (i = 0; i < sa->n; i++)
sarrayAddString(csa, sa->array[i], L_COPY);
return csa;
}
/*!
* sarrayClone()
*
* Input: sarray
* Return: ptr to same sarray, or null on error
*/
SARRAY *
sarrayClone(SARRAY *sa)
{
PROCNAME("sarrayClone");
if (!sa)
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
sarrayChangeRefcount(sa, 1);
return sa;
}
/*!
* sarrayAddString()
*
* Input: sarray
* string (string to be added)
* copyflag (L_INSERT, L_COPY)
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) Legacy usage decrees that we always use 0 to insert a string
* directly and 1 to insert a copy of the string. The
* enums for L_INSERT and L_COPY agree with this convention,
* and will not change in the future.
* (2) See usage comments at the top of this file.
*/
l_int32
sarrayAddString(SARRAY *sa,
char *string,
l_int32 copyflag)
{
l_int32 n;
PROCNAME("sarrayAddString");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if (!string)
return ERROR_INT("string not defined", procName, 1);
if (copyflag != L_INSERT && copyflag != L_COPY)
return ERROR_INT("invalid copyflag", procName, 1);
n = sarrayGetCount(sa);
if (n >= sa->nalloc)
sarrayExtendArray(sa);
if (copyflag == L_INSERT)
sa->array[n] = string;
else /* L_COPY */
sa->array[n] = stringNew(string);
sa->n++;
return 0;
}
/*!
* sarrayExtendArray()
*
* Input: sarray
* Return: 0 if OK, 1 on error
*/
l_int32
sarrayExtendArray(SARRAY *sa)
{
PROCNAME("sarrayExtendArray");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if ((sa->array = (char **)reallocNew((void **)&sa->array,
sizeof(char *) * sa->nalloc,
2 * sizeof(char *) * sa->nalloc)) == NULL)
return ERROR_INT("new ptr array not returned", procName, 1);
sa->nalloc *= 2;
return 0;
}
/*!
* sarrayRemoveString()
*
* Input: sarray
* index (of string within sarray)
* Return: removed string, or null on error
*/
char *
sarrayRemoveString(SARRAY *sa,
l_int32 index)
{
char *string;
char **array;
l_int32 i, n, nalloc;
PROCNAME("sarrayRemoveString");
if (!sa)
return (char *)ERROR_PTR("sa not defined", procName, NULL);
if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL)
return (char *)ERROR_PTR("array not returned", procName, NULL);
if (index < 0 || index >= n)
return (char *)ERROR_PTR("array index out of bounds", procName, NULL);
string = array[index];
/* If removed string is not at end of array, shift
* to fill in, maintaining original ordering.
* Note: if we didn't care about the order, we could
* put the last string array[n - 1] directly into the hole. */
for (i = index; i < n - 1; i++)
array[i] = array[i + 1];
sa->n--;
return string;
}
/*!
* sarrayClear()
*
* Input: sarray
* Return: 0 if OK; 1 on error
*/
l_int32
sarrayClear(SARRAY *sa)
{
l_int32 i;
PROCNAME("sarrayClear");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
for (i = 0; i < sa->n; i++) { /* free strings and null ptrs */
FREE(sa->array[i]);
sa->array[i] = NULL;
}
sa->n = 0;
return 0;
}
/*----------------------------------------------------------------------*
* Accessors *
*----------------------------------------------------------------------*/
/*!
* sarrayGetCount()
*
* Input: sarray
* Return: count, or 0 if no strings or on error
*/
l_int32
sarrayGetCount(SARRAY *sa)
{
PROCNAME("sarrayGetCount");
if (!sa)
return ERROR_INT("sa not defined", procName, 0);
return sa->n;
}
/*!
* sarrayGetArray()
*
* Input: sarray
* &nalloc (<optional return> number allocated string ptrs)
* &n (<optional return> number allocated strings)
* Return: ptr to string array, or null on error
*
* Notes:
* (1) Caution: the returned array is not a copy, so caller
* must not destroy it!
*/
char **
sarrayGetArray(SARRAY *sa,
l_int32 *pnalloc,
l_int32 *pn)
{
char **array;
PROCNAME("sarrayGetArray");
if (!sa)
return (char **)ERROR_PTR("sa not defined", procName, NULL);
array = sa->array;
if (pnalloc) *pnalloc = sa->nalloc;
if (pn) *pn = sa->n;
return array;
}
/*!
* sarrayGetString()
*
* Input: sarray
* index (to the index-th string)
* copyflag (L_NOCOPY or L_COPY)
* Return: string, or null on error
*
* Notes:
* (1) Legacy usage decrees that we always use 0 to get the
* pointer to the string itself, and 1 to get a copy of
* the string.
* (2) See usage comments at the top of this file.
* (3) To get a pointer to the string itself, use for copyflag:
* L_NOCOPY or 0 or FALSE
* To get a copy of the string, use for copyflag:
* L_COPY or 1 or TRUE
* The const values of L_NOCOPY and L_COPY are guaranteed not
* to change.
*/
char *
sarrayGetString(SARRAY *sa,
l_int32 index,
l_int32 copyflag)
{
PROCNAME("sarrayGetString");
if (!sa)
return (char *)ERROR_PTR("sa not defined", procName, NULL);
if (index < 0 || index >= sa->n)
return (char *)ERROR_PTR("index not valid", procName, NULL);
if (copyflag != L_NOCOPY && copyflag != L_COPY)
return (char *)ERROR_PTR("invalid copyflag", procName, NULL);
if (copyflag == L_NOCOPY)
return sa->array[index];
else /* L_COPY */
return stringNew(sa->array[index]);
}
/*!
* sarrayGetRefCount()
*
* Input: sarray
* Return: refcount, or UNDEF on error
*/
l_int32
sarrayGetRefcount(SARRAY *sa)
{
PROCNAME("sarrayGetRefcount");
if (!sa)
return ERROR_INT("sa not defined", procName, UNDEF);
return sa->refcount;
}
/*!
* sarrayChangeRefCount()
*
* Input: sarray
* delta (change to be applied)
* Return: 0 if OK, 1 on error
*/
l_int32
sarrayChangeRefcount(SARRAY *sa,
l_int32 delta)
{
PROCNAME("sarrayChangeRefcount");
if (!sa)
return ERROR_INT("sa not defined", procName, UNDEF);
sa->refcount += delta;
return 0;
}
/*----------------------------------------------------------------------*
* Conversion to string *
*----------------------------------------------------------------------*/
/*!
* sarrayToString()
*
* Input: sarray
* addnlflag (flag: 0 adds nothing to each substring
* 1 adds '\n' to each substring
* 2 adds ' ' to each substring)
* Return: dest string, or null on error
*
* Notes:
* (1) Concatenates all the strings in the sarray, preserving
* all white space.
* (2) If addnlflag != 0, adds either a '\n' or a ' ' after
* each substring.
* (3) This function was NOT implemented as:
* for (i = 0; i < n; i++)
* strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
* Do you see why?
*/
char *
sarrayToString(SARRAY *sa,
l_int32 addnlflag)
{
PROCNAME("sarrayToString");
if (!sa)
return (char *)ERROR_PTR("sa not defined", procName, NULL);
return sarrayToStringRange(sa, 0, 0, addnlflag);
}
/*!
* sarrayToStringRange()
*
* Input: sarray
* first (index of first string to use; starts with 0)
* nstrings (number of strings to append into the result; use
* 0 to append to the end of the sarray)
* addnlflag (flag: 0 adds nothing to each substring
* 1 adds '\n' to each substring
* 2 adds ' ' to each substring)
* Return: dest string, or null on error
*
* Notes:
* (1) Concatenates the specified strings inthe sarray, preserving
* all white space.
* (2) If addnlflag != 0, adds either a '\n' or a ' ' after
* each substring.
* (3) If the sarray is empty, this returns a string with just
* the character corresponding to @addnlflag.
*/
char *
sarrayToStringRange(SARRAY *sa,
l_int32 first,
l_int32 nstrings,
l_int32 addnlflag)
{
char *dest, *src;
l_int32 n, i, last, size, index, len;
PROCNAME("sarrayToStringRange");
if (!sa)
return (char *)ERROR_PTR("sa not defined", procName, NULL);
if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2)
return (char *)ERROR_PTR("invalid addnlflag", procName, NULL);
n = sarrayGetCount(sa);
/* Empty sa; return char corresponding to addnlflag only */
if (n == 0) {
if (first == 0) {
if (addnlflag == 0)
return stringNew("");
if (addnlflag == 1)
return stringNew("\n");
else /* addnlflag == 2) */
return stringNew(" ");
}
else
return (char *)ERROR_PTR("first not valid", procName, NULL);
}
if (first < 0 || first >= n)
return (char *)ERROR_PTR("first not valid", procName, NULL);
if (nstrings == 0 || (nstrings > n - first))
nstrings = n - first; /* no overflow */
last = first + nstrings - 1;
size = 0;
for (i = first; i <= last; i++)
size += strlen(sarrayGetString(sa, i, L_NOCOPY)) + 2;
if ((dest = (char *)CALLOC(size + 1, sizeof(char))) == NULL)
return (char *)ERROR_PTR("dest not made", procName, NULL);
index = 0;
for (i = first; i <= last; i++) {
src = sa->array[i];
len = strlen(src);
memcpy(dest + index, src, len);
index += len;
if (addnlflag == 1) {
dest[index] = '\n';
index++;
}
else if (addnlflag == 2) {
dest[index] = ' ';
index++;
}
}
return dest;
}
/*----------------------------------------------------------------------*
* Concatenate 2 sarrays *
*----------------------------------------------------------------------*/
/*!
* sarrayConcatenate()
*
* Input: sa1 (to be added to)
* sa2 (append to sa1)
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) Copies of the strings in sarray2 are added to sarray1.
*/
l_int32
sarrayConcatenate(SARRAY *sa1,
SARRAY *sa2)
{
char *str;
l_int32 n, i;
PROCNAME("sarrayConcatenate");
if (!sa1)
return ERROR_INT("sa1 not defined", procName, 1);
if (!sa2)
return ERROR_INT("sa2 not defined", procName, 1);
n = sarrayGetCount(sa2);
for (i = 0; i < n; i++) {
str = sarrayGetString(sa2, i, L_NOCOPY);
sarrayAddString(sa1, str, L_COPY);
}
return 0;
}
/*!
* sarrayAppendRange()
*
* Input: sa1 (to be added to)
* sa2 (append specified range of strings in sa2 to sa1)
* start (index of first string of sa2 to append)
* end (index of last string of sa2 to append)
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) Copies of the strings in sarray2 are added to sarray1.
* (2) The [start ... end] range is truncated if necessary.
*/
l_int32
sarrayAppendRange(SARRAY *sa1,
SARRAY *sa2,
l_int32 start,
l_int32 end)
{
char *str;
l_int32 n, i;
PROCNAME("sarrayAppendRange");
if (!sa1)
return ERROR_INT("sa1 not defined", procName, 1);
if (!sa2)
return ERROR_INT("sa2 not defined", procName, 1);
if (start < 0)
start = 0;
n = sarrayGetCount(sa2);
if (end >= n)
end = n - 1;
if (start > end)
return ERROR_INT("start > end", procName, 1);
for (i = start; i <= end; i++) {
str = sarrayGetString(sa2, i, L_NOCOPY);
sarrayAddString(sa1, str, L_COPY);
}
return 0;
}
/*----------------------------------------------------------------------*
* Convert word sarray to line sarray *
*----------------------------------------------------------------------*/
/*!
* sarrayConvertWordsToLines()
*
* Input: sa (sa of individual words)
* linesize (max num of chars in each line)
* Return: saout (sa of formatted lines), or null on error
*
* This is useful for re-typesetting text to a specific maximum
* line length. The individual words in the input sarray
* are concatenated into textlines. An input word string of zero
* length is taken to be a paragraph separator. Each time
* such a string is found, the current line is ended and
* a new line is also produced that contains just the
* string of zero length (""). When the output sarray
* of lines is eventually converted to a string with newlines
* (typically) appended to each line string, the empty
* strings are just converted to newlines, producing the visible
* paragraph separation.
*
* What happens when a word is larger than linesize?
* We write it out as a single line anyway! Words preceding
* or following this long word are placed on lines preceding
* or following the line with the long word. Why this choice?
* Long "words" found in text documents are typically URLs, and
* it's often desirable not to put newlines in the middle of a URL.
* The text display program (e.g., text editor) will typically
* wrap the long "word" to fit in the window.
*/
SARRAY *
sarrayConvertWordsToLines(SARRAY *sa,
l_int32 linesize)
{
char *wd, *strl;
char emptystring[] = "";
l_int32 n, i, len, totlen;
SARRAY *sal, *saout;
PROCNAME("sarrayConvertWordsToLines");
if (!sa)
return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
if ((saout = sarrayCreate(0)) == NULL)
return (SARRAY *)ERROR_PTR("saout not defined", procName, NULL);
n = sarrayGetCount(sa);
totlen = 0;
sal = NULL;
for (i = 0; i < n; i++) {
if (!sal) {
if ((sal = sarrayCreate(0)) == NULL)
return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
}
wd = sarrayGetString(sa, i, L_NOCOPY);
len = strlen(wd);
if (len == 0) { /* end of paragraph: end line & insert blank line */
if (totlen > 0) {
strl = sarrayToString(sal, 2);
sarrayAddString(saout, strl, L_INSERT);
}
sarrayAddString(saout, emptystring, L_COPY);
sarrayDestroy(&sal);
totlen = 0;
}
else if (totlen == 0 && len + 1 > linesize) { /* long word! */
sarrayAddString(saout, wd, L_COPY); /* copy to one line */
}
else if (totlen + len + 1 > linesize) { /* end line & start new one */
strl = sarrayToString(sal, 2);
sarrayAddString(saout, strl, L_INSERT);
sarrayDestroy(&sal);
if ((sal = sarrayCreate(0)) == NULL)
return (SARRAY *)ERROR_PTR("sal not made", procName, NULL);
sarrayAddString(sal, wd, L_COPY);
totlen = len + 1;
}
else { /* add to current line */
sarrayAddString(sal, wd, L_COPY);
totlen += len + 1;
}
}
if (totlen > 0) { /* didn't end with blank line; output last line */
strl = sarrayToString(sal, 2);
sarrayAddString(saout, strl, L_INSERT);
sarrayDestroy(&sal);
}
return saout;
}
/*----------------------------------------------------------------------*
* Split string on separator list *
*----------------------------------------------------------------------*/
/*
* sarraySplitString()
*
* Input: sa (to append to; typically empty initially)
* str (string to split; not changed)
* separators (characters that split input string)
* Return: 0 if OK, 1 on error.
*
* Notes:
* (1) This uses strtokSafe(). See the notes there in utils.c.
*/
l_int32
sarraySplitString(SARRAY *sa,
const char *str,
const char *separators)
{
char *cstr, *substr, *saveptr;
PROCNAME("sarraySplitString");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if (!str)
return ERROR_INT("str not defined", procName, 1);
if (!separators)
return ERROR_INT("separators not defined", procName, 1);
cstr = stringNew(str); /* preserves const-ness of input str */
substr = strtokSafe(cstr, separators, &saveptr);
if (substr)
sarrayAddString(sa, substr, L_INSERT);
while ((substr = strtokSafe(NULL, separators, &saveptr)))
sarrayAddString(sa, substr, L_INSERT);
FREE(cstr);
return 0;
}
/*----------------------------------------------------------------------*
* Filter sarray *
*----------------------------------------------------------------------*/
/*!
* sarraySelectBySubstring()
*
* Input: sain (input sarray)
* substr (<optional> substring for matching; can be NULL)
* Return: saout (output sarray, filtered with substring) or null on error
*
* Notes:
* (1) This selects all strings in sain that have substr as a substring.
* Note that we can't use strncmp() because we're looking for
* a match to the substring anywhere within each filename.
* (2) If substr == NULL, returns a copy of the sarray.
*/
SARRAY *
sarraySelectBySubstring(SARRAY *sain,
const char *substr)
{
char *str;
l_int32 n, i, offset, found;
SARRAY *saout;
PROCNAME("sarraySelectBySubstring");
if (!sain)
return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
n = sarrayGetCount(sain);
if (!substr || n == 0)
return sarrayCopy(sain);
saout = sarrayCreate(n);
for (i = 0; i < n; i++) {
str = sarrayGetString(sain, i, L_NOCOPY);
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
strlen(substr), &offset, &found);
if (found)
sarrayAddString(saout, str, L_COPY);
}
return saout;
}
/*!
* sarrayParseRange()
*
* Input: sa (input sarray)
* start (index to start range search)
* &actualstart (<return> index of actual start; may be > 'start')
* &end (<return> index of end)
* &newstart (<return> index of start of next range)
* substr (substring for matching at beginning of string)
* loc (byte offset within the string for the pattern; use
* -1 if the location does not matter);
* Return: 0 if valid range found; 1 otherwise
*
* Notes:
* (1) This finds the range of the next set of strings in SA,
* beginning the search at 'start', that does NOT have
* the substring 'substr' either at the indicated location
* in the string or anywhere in the string. The input
* variable 'loc' is the specified offset within the string;
* use -1 to indicate 'anywhere in the string'.
* (2) Always check the return value to verify that a valid range
* was found.
* (3) If a valid range is not found, the values of actstart,
* end and newstart are all set to the size of sa.
* (4) If this is the last valid range, newstart returns the value n.
* In use, this should be tested before calling the function.
* (5) Usage example. To find all the valid ranges in a file
* where the invalid lines begin with two dashes, copy each
* line in the file to a string in an sarray, and do:
* start = 0;
* while (!sarrayParseRange(sa, start, &actstart, &end, &start,
* "--", 0))
* fprintf(stderr, "start = %d, end = %d\n", actstart, end);
*/
l_int32
sarrayParseRange(SARRAY *sa,
l_int32 start,
l_int32 *pactualstart,
l_int32 *pend,
l_int32 *pnewstart,
const char *substr,
l_int32 loc)
{
char *str;
l_int32 n, i, offset, found;
PROCNAME("sarrayParseRange");
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if (!pactualstart || !pend || !pnewstart)
return ERROR_INT("not all range addresses defined", procName, 1);
n = sarrayGetCount(sa);
*pactualstart = *pend = *pnewstart = n;
if (!substr)
return ERROR_INT("substr not defined", procName, 1);
/* Look for the first string without the marker */
if (start < 0 || start >= n)
return 1;
for (i = start; i < n; i++) {
str = sarrayGetString(sa, i, L_NOCOPY);
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
strlen(substr), &offset, &found);
if (loc < 0) {
if (!found) break;
} else {
if (!found || offset != loc) break;
}
}
start = i;
if (i == n) /* couldn't get started */
return 1;
/* Look for the last string without the marker */
*pactualstart = start;
for (i = start + 1; i < n; i++) {
str = sarrayGetString(sa, i, L_NOCOPY);
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
strlen(substr), &offset, &found);
if (loc < 0) {
if (found) break;
} else {
if (found && offset == loc) break;
}
}
*pend = i - 1;
start = i;
if (i == n) /* no further range */
return 0;
/* Look for the first string after *pend without the marker.
* This will start the next run of strings, if it exists. */
for (i = start; i < n; i++) {
str = sarrayGetString(sa, i, L_NOCOPY);
arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
strlen(substr), &offset, &found);
if (loc < 0) {
if (!found) break;
} else {
if (!found || offset != loc) break;
}
}
if (i < n)
*pnewstart = i;
return 0;
}
/*----------------------------------------------------------------------*
* Sort *
*----------------------------------------------------------------------*/
/*!
* sarraySort()
*
* Input: saout (output sarray; can be NULL or equal to sain)
* sain (input sarray)
* sortorder (L_SORT_INCREASING or L_SORT_DECREASING)
* Return: saout (output sarray, sorted by ascii value), or null on error
*
* Notes:
* (1) Set saout = sain for in-place; otherwise, set naout = NULL.
* (2) Shell sort, modified from K&R, 2nd edition, p.62.
* Slow but simple O(n logn) sort.
*/
SARRAY *
sarraySort(SARRAY *saout,
SARRAY *sain,
l_int32 sortorder)
{
char **array;
char *tmp;
l_int32 n, i, j, gap;
PROCNAME("sarraySort");
if (!sain)
return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
/* Make saout if necessary; otherwise do in-place */
if (!saout)
saout = sarrayCopy(sain);
else if (sain != saout)
return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL);
array = saout->array; /* operate directly on the array */
n = sarrayGetCount(saout);
/* Shell sort */
for (gap = n/2; gap > 0; gap = gap / 2) {
for (i = gap; i < n; i++) {
for (j = i - gap; j >= 0; j -= gap) {
if ((sortorder == L_SORT_INCREASING &&
stringCompareLexical(array[j], array[j + gap])) ||
(sortorder == L_SORT_DECREASING &&
stringCompareLexical(array[j + gap], array[j])))
{
tmp = array[j];
array[j] = array[j + gap];
array[j + gap] = tmp;
}
}
}
}
return saout;
}
/*!
* stringCompareLexical()
*
* Input: str1
* str2
* Return: 1 if str1 > str2 (lexically); 0 otherwise
*
* Notes:
* (1) If the lexical values are identical, return a 0, to
* indicate that no swapping is required to sort the strings.
*/
l_int32
stringCompareLexical(const char *str1,
const char *str2)
{
l_int32 i, len1, len2, len;
PROCNAME("sarrayCompareLexical");
if (!str1)
return ERROR_INT("str1 not defined", procName, 1);
if (!str2)
return ERROR_INT("str2 not defined", procName, 1);
len1 = strlen(str1);
len2 = strlen(str2);
len = L_MIN(len1, len2);
for (i = 0; i < len; i++) {
if (str1[i] == str2[i])
continue;
if (str1[i] > str2[i])
return 1;
else
return 0;
}
if (len1 > len2)
return 1;
else
return 0;
}
/*----------------------------------------------------------------------*
* Serialize for I/O *
*----------------------------------------------------------------------*/
/*!
* sarrayRead()
*
* Input: filename
* Return: sarray, or null on error
*/
SARRAY *
sarrayRead(const char *filename)
{
FILE *fp;
SARRAY *sa;
PROCNAME("sarrayRead");
if (!filename)
return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL);
if ((fp = fopenReadStream(filename)) == NULL)
return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);
if ((sa = sarrayReadStream(fp)) == NULL) {
fclose(fp);
return (SARRAY *)ERROR_PTR("sa not read", procName, NULL);
}
fclose(fp);
return sa;
}
/*!
* sarrayReadStream()
*
* Input: stream
* Return: sarray, or null on error
*
* Notes:
* (1) We store the size of each string along with the string.
* (2) This allows a string to have embedded newlines. By reading
* the entire string, as determined by its size, we are
* not affected by any number of embedded newlines.
*/
SARRAY *
sarrayReadStream(FILE *fp)
{
char *stringbuf;
l_int32 i, n, size, index, bufsize, ret, version;
SARRAY *sa;
PROCNAME("sarrayReadStream");
if (!fp)
return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL);
ret = fscanf(fp, "\nSarray Version %d\n", &version);
if (ret != 1)
return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL);
if (version != SARRAY_VERSION_NUMBER)
return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL);
fscanf(fp, "Number of strings = %d\n", &n);
if ((sa = sarrayCreate(n)) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
bufsize = L_BUF_SIZE + 1;
if ((stringbuf = (char *)CALLOC(bufsize, sizeof(char))) == NULL)
return (SARRAY *)ERROR_PTR("stringbuf not made", procName, NULL);
for (i = 0; i < n; i++) {
/* Get the size of the stored string */
fscanf(fp, "%d[%d]:", &index, &size);
/* Expand the string buffer if necessary */
if (size > bufsize - 5) {
FREE(stringbuf);
bufsize = (l_int32)(1.5 * size);
stringbuf = (char *)CALLOC(bufsize, sizeof(char));
}
/* Read the stored string, plus leading spaces and trailing \n */
fread(stringbuf, 1, size + 3, fp);
/* Remove the \n that was added by sarrayWriteStream() */
stringbuf[size + 2] = '\0';
/* Copy it in, skipping the 2 leading spaces */
sarrayAddString(sa, stringbuf + 2, L_COPY);
}
fscanf(fp, "\n");
FREE(stringbuf);
return sa;
}
/*!
* sarrayWrite()
*
* Input: filename
* sarray
* Return: 0 if OK; 1 on error
*/
l_int32
sarrayWrite(const char *filename,
SARRAY *sa)
{
FILE *fp;
PROCNAME("sarrayWrite");
if (!filename)
return ERROR_INT("filename not defined", procName, 1);
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if ((fp = fopen(filename, "w")) == NULL)
return ERROR_INT("stream not opened", procName, 1);
if (sarrayWriteStream(fp, sa))
return ERROR_INT("sa not written to stream", procName, 1);
fclose(fp);
return 0;
}
/*!
* sarrayWriteStream()
*
* Input: stream
* sarray
* Returns 0 if OK; 1 on error
*
* Notes:
* (1) This appends a '\n' to each string, which is stripped
* off by sarrayReadStream().
*/
l_int32
sarrayWriteStream(FILE *fp,
SARRAY *sa)
{
l_int32 i, n, len;
PROCNAME("sarrayWriteStream");
if (!fp)
return ERROR_INT("stream not defined", procName, 1);
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
n = sarrayGetCount(sa);
fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
fprintf(fp, "Number of strings = %d\n", n);
for (i = 0; i < n; i++) {
len = strlen(sa->array[i]);
fprintf(fp, " %d[%d]: %s\n", i, len, sa->array[i]);
}
fprintf(fp, "\n");
return 0;
}
/*!
* sarrayAppend()
*
* Input: filename
* sarray
* Return: 0 if OK; 1 on error
*/
l_int32
sarrayAppend(const char *filename,
SARRAY *sa)
{
FILE *fp;
PROCNAME("sarrayAppend");
if (!filename)
return ERROR_INT("filename not defined", procName, 1);
if (!sa)
return ERROR_INT("sa not defined", procName, 1);
if ((fp = fopen(filename, "a")) == NULL)
return ERROR_INT("stream not opened", procName, 1);
if (sarrayWriteStream(fp, sa))
return ERROR_INT("sa not appended to stream", procName, 1);
fclose(fp);
return 0;
}
/*---------------------------------------------------------------------*
* Directory filenames *
*---------------------------------------------------------------------*/
/*!
* getSortedPathnamesInDirectory()
*
* Input: directory name
* substr (<optional> substring filter on filenames; can be NULL)
* firstpage (0-based)
* npages (use 0 for all to the end)
* Return: sarray of sorted pathnames, or NULL on error
*
* Notes:
* (1) If 'substr' is not NULL, only filenames that contain
* the substring can be returned. If 'substr' is NULL,
* none of the filenames are filtered out.
* (2) The files in the directory, after optional filtering by
* the substring, are lexically sorted in increasing order.
* The full pathnames are returned for the requested sequence.
* If no files are found after filtering, returns an empty sarray.
*/
SARRAY *
getSortedPathnamesInDirectory(const char *dirname,
const char *substr,
l_int32 firstpage,
l_int32 npages)
{
char *fname, *fullname;
l_int32 i, nfiles, lastpage;
SARRAY *sa, *safiles, *saout;
PROCNAME("getSortedPathnamesInDirectory");
if (!dirname)
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
if ((sa = getFilenamesInDirectory(dirname)) == NULL)
return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
safiles = sarraySelectBySubstring(sa, substr);
sarrayDestroy(&sa);
nfiles = sarrayGetCount(safiles);
if (nfiles == 0) {
L_WARNING("no files found", procName);
return safiles;
}
sarraySort(safiles, safiles, L_SORT_INCREASING);
firstpage = L_MIN(L_MAX(firstpage, 0), nfiles - 1);
if (npages == 0)
npages = nfiles - firstpage;
lastpage = L_MIN(firstpage + npages - 1, nfiles - 1);
saout = sarrayCreate(lastpage - firstpage + 1);
for (i = firstpage; i <= lastpage; i++) {
fname = sarrayGetString(safiles, i, L_NOCOPY);
fullname = genPathname(dirname, fname);
sarrayAddString(saout, fullname, L_INSERT);
}
sarrayDestroy(&safiles);
return saout;
}
/*!
* getFilenamesInDirectory()
*
* Input: directory name
* Return: sarray of file names, or NULL on error
*
* Notes:
* (1) The versions compiled under unix and cygwin use the POSIX C
* library commands for handling directories. For windows,
* there is a separate implementation.
* (2) It returns an array of filename tails; i.e., only the part of
* the path after the last slash.
* (3) Use of the d_type field of dirent is not portable:
* "According to POSIX, the dirent structure contains a field
* char d_name[] of unspecified size, with at most NAME_MAX
* characters preceding the terminating null character. Use
* of other fields will harm the portability of your programs."
* (4) As a consequence of (3), we note several things:
* - MINGW doesn't have a d_type member.
* - Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
* for d_type from all files.
* On these systems, this function will return directories
* (except for '.' and '..', which are eliminated using
* the d_name field).
*/
#ifndef COMPILER_MSVC
SARRAY *
getFilenamesInDirectory(const char *dirname)
{
char *name;
l_int32 len;
SARRAY *safiles;
DIR *pdir;
struct dirent *pdirentry;
PROCNAME("getFilenamesInDirectory");
if (!dirname)
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
if ((safiles = sarrayCreate(0)) == NULL)
return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
if ((pdir = opendir(dirname)) == NULL)
return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL);
while ((pdirentry = readdir(pdir))) {
/* It's nice to ignore directories. For this it is necessary to
* define _BSD_SOURCE in the CC command, because the DT_DIR
* flag is non-standard. */
#if !defined(__MINGW32__) && !defined(_CYGWIN_ENVIRON) && !defined(__SOLARIS__)
if (pdirentry->d_type == DT_DIR)
continue;
#endif
/* Filter out "." and ".." if they're passed through */
name = pdirentry->d_name;
len = strlen(name);
if (len == 1 && name[len - 1] == '.') continue;
if (len == 2 && name[len - 1] == '.' && name[len - 2] == '.') continue;
sarrayAddString(safiles, name, L_COPY);
}
closedir(pdir);
return safiles;
}
#else /* COMPILER_MSVC */
/* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
#include <windows.h>
#include <tchar.h>
SARRAY *
getFilenamesInDirectory(const char *dirname)
{
SARRAY *safiles;
WIN32_FIND_DATAA ffd;
size_t length_of_path;
CHAR szDir[MAX_PATH]; /* MAX_PATH is defined in stdlib.h */
HANDLE hFind = INVALID_HANDLE_VALUE;
PROCNAME("getFilenamesInDirectory");
if (!dirname)
return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
length_of_path = strlen(dirname);
if (length_of_path > (MAX_PATH - 2))
return (SARRAY *)ERROR_PTR("dirname is to long", procName, NULL);
strncpy(szDir, dirname, MAX_PATH);
szDir[MAX_PATH - 1] = '\0';
strncat(szDir, TEXT("\\*"), MAX_PATH - strlen(szDir));
if ((safiles = sarrayCreate(0)) == NULL)
return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
hFind = FindFirstFileA(szDir, &ffd);
if (INVALID_HANDLE_VALUE == hFind) {
sarrayDestroy(&safiles);
return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL);
}
while (FindNextFileA(hFind, &ffd) != 0) {
if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) /* skip dirs */
continue;
sarrayAddString(safiles, ffd.cFileName, L_COPY);
}
FindClose(hFind);
return safiles;
}
#endif /* COMPILER_MSVC */