hyphen.c - platform/external/hyphenation - Git at Google

 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
  * licenses follows.
  */

 /* LibHnj - a library for high quality hyphenation and justification
  * Copyright (C) 1998 Raph Levien,
  * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
  *           (C) 2006, 2007, 2008 László Németh (nemeth at OOo)
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Library General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Library General Public License for more details.
  *
  * You should have received a copy of the GNU Library General Public
  * License along with this library; if not, write to the
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA  02111-1307  USA.
  */

 /*
  * The contents of this file are subject to the Mozilla Public License
  * Version 1.0 (the "MPL"); you may not use this file except in
  * compliance with the MPL.  You may obtain a copy of the MPL at
  * http://www.mozilla.org/MPL/
  *
  * Software distributed under the MPL is distributed on an "AS IS" basis,
  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
  * for the specific language governing rights and limitations under the
  * MPL.
  *
  */
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <stdlib.h> /* for NULL, malloc */
 #include <stdio.h>  /* for fprintf */
 #include <string.h> /* for strdup */
 #include <unistd.h> /* for close */

 #define noVERBOSE

 #include "hnjalloc.h"
 #include "hyphen.h"

 static char *
 hnj_strdup (const char *s)
 {
     char *new;
     int l;

     l = strlen (s);
     new = hnj_malloc (l + 1);
     memcpy (new, s, l);
     new[l] = 0;
     return new;
 }

 /* remove cross-platform text line end characters */
 void hnj_strchomp(char * s)
 {
     int k = strlen(s);
     if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
     if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
 }

 /* a little bit of a hash table implementation. This simply maps strings
    to state numbers */

 typedef struct _HashTab HashTab;
 typedef struct _HashEntry HashEntry;

 /* A cheap, but effective, hack. */
 #define HASH_SIZE 31627

 struct _HashTab {
     HashEntry *entries[HASH_SIZE];
 };

 struct _HashEntry {
     HashEntry *next;
     char *key;
     int val;
 };

 /* a char* hash function from ASU - adapted from Gtk+ */
 static unsigned int
 hnj_string_hash (const char *s)
 {
     const char *p;
     unsigned int h=0, g;
     for(p = s; *p != '\0'; p += 1) {
         h = ( h << 4 ) + *p;
         if ( ( g = h & 0xf0000000 ) ) {
             h = h ^ (g >> 24);
             h = h ^ g;
         }
     }
     return h /* % M */;
 }

 static HashTab *
 hnj_hash_new (void)
 {
     HashTab *hashtab;
     int i;

     hashtab = hnj_malloc (sizeof(HashTab));
     for (i = 0; i < HASH_SIZE; i++)
         hashtab->entries[i] = NULL;

     return hashtab;
 }

 static void
 hnj_hash_free (HashTab *hashtab)
 {
     int i;
     HashEntry *e, *next;

     for (i = 0; i < HASH_SIZE; i++)
         for (e = hashtab->entries[i]; e; e = next)
         {
             next = e->next;
             hnj_free (e->key);
             hnj_free (e);
         }

     hnj_free (hashtab);
 }

 /* assumes that key is not already present! */
 static void
 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
 {
     int i;
     HashEntry *e;

     i = hnj_string_hash (key) % HASH_SIZE;
     e = hnj_malloc (sizeof(HashEntry));
     e->next = hashtab->entries[i];
     e->key = hnj_strdup (key);
     e->val = val;
     hashtab->entries[i] = e;
 }

 /* return val if found, otherwise -1 */
 static int
 hnj_hash_lookup (HashTab *hashtab, const char *key)
 {
     int i;
     HashEntry *e;
     i = hnj_string_hash (key) % HASH_SIZE;
     for (e = hashtab->entries[i]; e; e = e->next)
         if (!strcmp (key, e->key))
             return e->val;
     return -1;
 }

 /* Get the state number, allocating a new state if necessary. */
 static int
 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
 {
     int state_num;

     state_num = hnj_hash_lookup (hashtab, string);

     if (state_num >= 0)
         return state_num;

     hnj_hash_insert (hashtab, string, dict->num_states);
     /* predicate is true if dict->num_states is a power of two */
     if (!(dict->num_states & (dict->num_states - 1)))
     {
         dict->states = hnj_realloc (dict->states,
             (dict->num_states << 1) *
             sizeof(HyphenState));
     }
     dict->states[dict->num_states].match = NULL;
     dict->states[dict->num_states].repl = NULL;
     dict->states[dict->num_states].fallback_state = -1;
     dict->states[dict->num_states].num_trans = 0;
     dict->states[dict->num_states].trans = NULL;
     return dict->num_states++;
 }

 /* add a transition from state1 to state2 through ch - assumes that the
    transition does not already exist */
 static void
 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
 {
     int num_trans;

     num_trans = dict->states[state1].num_trans;
     if (num_trans == 0)
     {
         dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
     }
     else if (!(num_trans & (num_trans - 1)))
     {
         dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
             (num_trans << 1) *
             sizeof(HyphenTrans));
     }
     dict->states[state1].trans[num_trans].ch = ch;
     dict->states[state1].trans[num_trans].new_state = state2;
     dict->states[state1].num_trans++;
 }

 #ifdef VERBOSE
 HashTab *global;

 static char *
 get_state_str (int state)
 {
     int i;
     HashEntry *e;

     for (i = 0; i < HASH_SIZE; i++)
         for (e = global->entries[i]; e; e = e->next)
             if (e->val == state)
                 return e->key;
     return NULL;
 }
 #endif

 // Get a line from the dictionary contents.
 static char *
 get_line (char *s, int size, const char *dict_contents, int dict_length,
     int *dict_ptr)
 {
     int len = 0;
     while (len < (size - 1) && *dict_ptr < dict_length) {
         s[len++] = *(dict_contents + *dict_ptr);
         (*dict_ptr)++;
         if (s[len - 1] == '\n')
             break;
     }
     s[len] = '\0';
     if (len > 0) {
         return s;
     } else {
         return NULL;
     }
 }

 HyphenDict *
 hnj_hyphen_load (const char *fn)
 {
     if (fn == NULL)
         return NULL;
     const int fd = open(fn, O_RDONLY);
     if (fd == -1)
         return NULL;
     struct stat sb;
     if (fstat(fd, &sb) == -1)  {  /* To obtain file size */
         close(fd);
         return NULL;
     }

     const char *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
     if (addr == MAP_FAILED) {
         close(fd);
         return NULL;
     }
     HyphenDict *dict = hnj_hyphen_load_from_buffer(addr, sb.st_size);
     munmap((void *)addr, sb.st_size);
     close(fd);

     return dict;
 }

 HyphenDict *
 hnj_hyphen_load_from_buffer (const char *dict_contents, int dict_length)
 {
     HyphenDict *dict[2];
     HashTab *hashtab;
     char buf[MAX_CHARS];
     char word[MAX_CHARS];
     char pattern[MAX_CHARS];
     char * repl;
     signed char replindex;
     signed char replcut;
     int state_num = 0, last_state;
     int i, j, k;
     char ch;
     int found;
     HashEntry *e;
     int nextlevel = 0;

     if (dict_contents == NULL)
         return NULL;

     int dict_ptr = 0;
 // loading one or two dictionaries (separated by NEXTLEVEL keyword)
     for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
         hashtab = hnj_hash_new ();
 #ifdef VERBOSE
         global = hashtab;
 #endif
         hnj_hash_insert (hashtab, "", 0);
         dict[k] = hnj_malloc (sizeof(HyphenDict));
         dict[k]->num_states = 1;
         dict[k]->states = hnj_malloc (sizeof(HyphenState));
         dict[k]->states[0].match = NULL;
         dict[k]->states[0].repl = NULL;
         dict[k]->states[0].fallback_state = -1;
         dict[k]->states[0].num_trans = 0;
         dict[k]->states[0].trans = NULL;
         dict[k]->nextlevel = NULL;
         dict[k]->lhmin = 0;
         dict[k]->rhmin = 0;
         dict[k]->clhmin = 0;
         dict[k]->crhmin = 0;

         /* read in character set info */
         if (k == 0) {
             for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
             get_line(dict[k]->cset, sizeof(dict[k]->cset), dict_contents,
                 dict_length, &dict_ptr);
             for (i=0;i<MAX_NAME;i++)
                 if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
                     dict[k]->cset[i] = 0;
             dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
         } else {
             strcpy(dict[k]->cset, dict[0]->cset);
             dict[k]->utf8 = dict[0]->utf8;
         }

         while (get_line(buf, sizeof(buf), dict_contents, dict_length,
                 &dict_ptr) != NULL)
         {
             if (buf[0] != '%')
             {
                 if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
                     nextlevel = 1;
                     break;
                 } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
                     dict[k]->lhmin = atoi(buf + 13);
                     continue;
                 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
                     dict[k]->rhmin = atoi(buf + 14);
                     continue;
                 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
                     dict[k]->clhmin = atoi(buf + 21);
                     continue;
                 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
                     dict[k]->crhmin = atoi(buf + 22);
                     continue;
                 }
                 j = 0;
                 pattern[j] = '0';
                 repl = strchr(buf, '/');
                 replindex = 0;
                 replcut = 0;
                 if (repl) {
                     char * index = strchr(repl + 1, ',');
                     *repl = '\0';
                     if (index) {
                         char * index2 = strchr(index + 1, ',');
                         *index = '\0';
                         if (index2) {
                             *index2 = '\0';
                             replindex = (signed char) atoi(index + 1) - 1;
                             replcut = (signed char) atoi(index2 + 1);
                         }
                     } else {
                         hnj_strchomp(repl + 1);
                         replindex = 0;
                         replcut = strlen(buf);
                     }
                     repl = hnj_strdup(repl + 1);
                 }
                 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
                 {
                     if (buf[i] >= '0' && buf[i] <= '9')
                         pattern[j] = buf[i];
                     else
                     {
                         word[j] = buf[i];
                         pattern[++j] = '0';
                     }
                 }
                 word[j] = '\0';
                 pattern[j + 1] = '\0';

                 i = 0;
                 if (!repl) {
                     /* Optimize away leading zeroes */
                     for (; pattern[i] == '0'; i++);
                 } else {
                     if (*word == '.') i++;
                     /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
                     if (dict[k]->utf8) {
                         int pu = -1;        /* unicode character position */
                         int ps = -1;        /* unicode start position (original replindex) */
                         int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
                         for (; pc < (strlen(word) + 1); pc++) {
                             /* beginning of an UTF-8 character (not '10' start bits) */
                             if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
                             if ((ps < 0) && (replindex == pu)) {
                                 ps = replindex;
                                 replindex = pc;
                             }
                             if ((ps >= 0) && ((pu - ps) == replcut)) {
                                 replcut = (pc - replindex);
                                 break;
                             }
                         }
                         if (*word == '.') replindex--;
                     }
                 }

 #ifdef VERBOSE
                 printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
 #endif
                 found = hnj_hash_lookup (hashtab, word);
                 state_num = hnj_get_state (dict[k], hashtab, word);
                 dict[k]->states[state_num].match = hnj_strdup (pattern + i);
                 dict[k]->states[state_num].repl = repl;
                 dict[k]->states[state_num].replindex = replindex;
                 if (!replcut) {
                     dict[k]->states[state_num].replcut = strlen(word);
                 } else {
                     dict[k]->states[state_num].replcut = replcut;
                 }

                 /* now, put in the prefix transitions */
                 for (; found < 0 ;j--)
                 {
                     last_state = state_num;
                     ch = word[j - 1];
                     word[j - 1] = '\0';
                     found = hnj_hash_lookup (hashtab, word);
                     state_num = hnj_get_state (dict[k], hashtab, word);
                     hnj_add_trans (dict[k], state_num, last_state, ch);
                 }
             }
         }

         /* Could do unioning of matches here (instead of the preprocessor script).
            If we did, the pseudocode would look something like this:

            foreach state in the hash table
            foreach i = [1..length(state) - 1]
            state to check is substr (state, i)
            look it up
            if found, and if there is a match, union the match in.

            It's also possible to avoid the quadratic blowup by doing the
            search in order of increasing state string sizes - then you
            can break the loop after finding the first match.

            This step should be optional in any case - if there is a
            preprocessed rule table, it's always faster to use that.

         */

         /* put in the fallback states */
         for (i = 0; i < HASH_SIZE; i++)
             for (e = hashtab->entries[i]; e; e = e->next)
             {
                 if (*(e->key)) for (j = 1; 1; j++)
                                {
                                    state_num = hnj_hash_lookup (hashtab, e->key + j);
                                    if (state_num >= 0)
                                        break;
                                }
                 /* KBH: FIXME state 0 fallback_state should always be -1? */
                 if (e->val)
                     dict[k]->states[e->val].fallback_state = state_num;
             }
 #ifdef VERBOSE
         for (i = 0; i < HASH_SIZE; i++)
             for (e = hashtab->entries[i]; e; e = e->next)
             {
                 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
                     dict[k]->states[e->val].fallback_state);
                 for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
                     printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
                         dict[k]->states[e->val].trans[j].new_state);
             }
 #endif

 #ifndef VERBOSE
         hnj_hash_free (hashtab);
 #endif
         state_num = 0;
     }
     if (k == 2) dict[0]->nextlevel = dict[1];
     return dict[0];
 }

 void hnj_hyphen_free (HyphenDict *dict)
 {
     int state_num;
     HyphenState *hstate;

     for (state_num = 0; state_num < dict->num_states; state_num++)
     {
         hstate = &dict->states[state_num];
         if (hstate->match)
             hnj_free (hstate->match);
         if (hstate->repl)
             hnj_free (hstate->repl);
         if (hstate->trans)
             hnj_free (hstate->trans);
     }
     if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);

     hnj_free (dict->states);

     hnj_free (dict);
 }

 #define MAX_WORD 256

 int hnj_hyphen_hyphenate (HyphenDict *dict,
     const char *word, int word_size,
     char *hyphens)
 {
     char prep_word_buf[MAX_WORD];
     char *prep_word;
     int i, j, k;
     int state;
     char ch;
     HyphenState *hstate;
     char *match;
     int offset;

     if (word_size + 3 < MAX_WORD)
         prep_word = prep_word_buf;
     else
         prep_word = hnj_malloc (word_size + 3);

     j = 0;
     prep_word[j++] = '.';

     for (i = 0; i < word_size; i++)
         prep_word[j++] = word[i];

     prep_word[j++] = '.';
     prep_word[j] = '\0';

     for (i = 0; i < j; i++)
         hyphens[i] = '0';

 #ifdef VERBOSE
     printf ("prep_word = %s\n", prep_word);
 #endif

     /* now, run the finite state machine */
     state = 0;
     for (i = 0; i < j; i++)
     {
         ch = prep_word[i];
         for (;;)
         {

             if (state == -1) {
                 /* return 1; */
                 /*  KBH: FIXME shouldn't this be as follows? */
                 state = 0;
                 goto try_next_letter;
             }

 #ifdef VERBOSE
             char *state_str;
             state_str = get_state_str (state);

             for (k = 0; k < i - strlen (state_str); k++)
                 putchar (' ');
             printf ("%s", state_str);
 #endif

             hstate = &dict->states[state];
             for (k = 0; k < hstate->num_trans; k++)
                 if (hstate->trans[k].ch == ch)
                 {
                     state = hstate->trans[k].new_state;
                     goto found_state;
                 }
             state = hstate->fallback_state;
 #ifdef VERBOSE
             printf (" falling back, fallback_state %d\n", state);
 #endif
         }
       found_state:
 #ifdef VERBOSE
         printf ("found state %d\n",state);
 #endif
         /* Additional optimization is possible here - especially,
            elimination of trailing zeroes from the match. Leading zeroes
            have already been optimized. */
         match = dict->states[state].match;
         /* replacing rules not handled by hyphen_hyphenate() */
         if (match && !dict->states[state].repl)
         {
             offset = i + 1 - strlen (match);
 #ifdef VERBOSE
             for (k = 0; k < offset; k++)
                 putchar (' ');
             printf ("%s\n", match);
 #endif
             /* This is a linear search because I tried a binary search and
                found it to be just a teeny bit slower. */
             for (k = 0; match[k]; k++)
                 if (hyphens[offset + k] < match[k])
                     hyphens[offset + k] = match[k];
         }

         /* KBH: we need this to make sure we keep looking in a word */
         /* for patterns even if the current character is not known in state 0 */
         /* since patterns for hyphenation may occur anywhere in the word */
       try_next_letter: ;

     }
 #ifdef VERBOSE
     for (i = 0; i < j; i++)
         putchar (hyphens[i]);
     putchar ('\n');
 #endif

     for (i = 0; i < j - 4; i++)
 #if 0
         if (hyphens[i + 1] & 1)
             hyphens[i] = '-';
 #else
     hyphens[i] = hyphens[i + 1];
 #endif
     hyphens[0] = '0';
     for (; i < word_size; i++)
         hyphens[i] = '0';
     hyphens[word_size] = '\0';

     if (prep_word != prep_word_buf)
         hnj_free (prep_word);

     return 0;
 }

 /* character length of the first n byte of the input word */
 int hnj_hyphen_strnlen(const char * word, int n, int utf8)
 {
     int i = 0;
     int j = 0;
     while (j < n && word[j] != '\0') {
         i++;
         for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
     }
     return i;
 }

 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
 	char *** rep, int ** pos, int ** cut, int lhmin)
 {
     int i, j;
     for (i = 1, j = 0; i < lhmin && word[j] != '\0'; i++) do {
             // check length of the non-standard part
             if (*rep && *pos && *cut && (*rep)[j]) {
                 char * rh = strchr((*rep)[j], '=');
                 if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
                         hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
                     free((*rep)[j]);
                     (*rep)[j] = NULL;
                     hyphens[j] = '0';
                 }
             } else {
                 hyphens[j] = '0';
             }
             j++;
         } while (utf8 && (word[j + 1] & 0xc0) == 0xc0);
     return 0;
 }

 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
 	char *** rep, int ** pos, int ** cut, int rhmin)
 {
     int i;
     int j = word_size - 2;
     for (i = 1; i < rhmin && j > 0; j--) {
         // check length of the non-standard part
         if (*rep && *pos && *cut && (*rep)[j]) {
             char * rh = strchr((*rep)[j], '=');
             if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
                     hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
                 free((*rep)[j]);
                 (*rep)[j] = NULL;
                 hyphens[j] = '0';
             }
         } else {
             hyphens[j] = '0';
         }
         if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
     }
     return 0;
 }

 // recursive function for compound level hyphenation
 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
     char * hyphens, char *** rep, int ** pos, int ** cut,
     int clhmin, int crhmin, int lend, int rend)
 {
     char prep_word_buf[MAX_WORD];
     char *prep_word;
     int i, j, k;
     int state;
     char ch;
     HyphenState *hstate;
     char *match;
     char *repl;
     signed char replindex;
     signed char replcut;
     int offset;
     int matchlen_buf[MAX_CHARS];
     int matchindex_buf[MAX_CHARS];
     char * matchrepl_buf[MAX_CHARS];
     int * matchlen;
     int * matchindex;
     char ** matchrepl;
     int isrepl = 0;
     int nHyphCount;

     if (word_size + 3 < MAX_CHARS) {
         prep_word = prep_word_buf;
         matchlen = matchlen_buf;
         matchindex = matchindex_buf;
         matchrepl = matchrepl_buf;
     } else {
         prep_word = hnj_malloc (word_size + 3);
         matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
         matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
         matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
     }

     j = 0;
     prep_word[j++] = '.';

     for (i = 0; i < word_size; i++)
         prep_word[j++] = word[i];

     prep_word[j++] = '.';
     prep_word[j] = '\0';

     for (i = 0; i < j; i++)
         hyphens[i] = '0';

 #ifdef VERBOSE
     printf ("prep_word = %s\n", prep_word);
 #endif

     /* now, run the finite state machine */
     state = 0;
     for (i = 0; i < j; i++)
     {
         ch = prep_word[i];
         for (;;)
         {

             if (state == -1) {
                 /* return 1; */
                 /*  KBH: FIXME shouldn't this be as follows? */
                 state = 0;
                 goto try_next_letter;
             }

 #ifdef VERBOSE
             char *state_str;
             state_str = get_state_str (state);

             for (k = 0; k < i - strlen (state_str); k++)
                 putchar (' ');
             printf ("%s", state_str);
 #endif

             hstate = &dict->states[state];
             for (k = 0; k < hstate->num_trans; k++)
                 if (hstate->trans[k].ch == ch)
                 {
                     state = hstate->trans[k].new_state;
                     goto found_state;
                 }
             state = hstate->fallback_state;
 #ifdef VERBOSE
             printf (" falling back, fallback_state %d\n", state);
 #endif
         }
       found_state:
 #ifdef VERBOSE
         printf ("found state %d\n",state);
 #endif
         /* Additional optimization is possible here - especially,
            elimination of trailing zeroes from the match. Leading zeroes
            have already been optimized. */
         match = dict->states[state].match;
         repl = dict->states[state].repl;
         replindex = dict->states[state].replindex;
         replcut = dict->states[state].replcut;
         /* replacing rules not handled by hyphen_hyphenate() */
         if (match)
         {
             offset = i + 1 - strlen (match);
 #ifdef VERBOSE
             for (k = 0; k < offset; k++)
                 putchar (' ');
             printf ("%s (%s)\n", match, repl);
 #endif
             if (repl) {
                 if (!isrepl) for(; isrepl < word_size; isrepl++) {
                         matchrepl[isrepl] = NULL;
                         matchindex[isrepl] = -1;
                     }
                 matchlen[offset + replindex] = replcut;
             }
             /* This is a linear search because I tried a binary search and
                found it to be just a teeny bit slower. */
             for (k = 0; match[k]; k++) {
                 if ((hyphens[offset + k] < match[k])) {
                     hyphens[offset + k] = match[k];
                     if (match[k]&1) {
                         matchrepl[offset + k] = repl;
                         if (repl && (k >= replindex) && (k <= replindex + replcut)) {
                             matchindex[offset + replindex] = offset + k;
                         }
                     }
                 }
             }

         }

         /* KBH: we need this to make sure we keep looking in a word */
         /* for patterns even if the current character is not known in state 0 */
         /* since patterns for hyphenation may occur anywhere in the word */
       try_next_letter: ;

     }
 #ifdef VERBOSE
     for (i = 0; i < j; i++)
         putchar (hyphens[i]);
     putchar ('\n');
 #endif

     for (i = 0; i < j - 3; i++)
 #if 0
         if (hyphens[i + 1] & 1)
             hyphens[i] = '-';
 #else
     hyphens[i] = hyphens[i + 1];
 #endif
     for (; i < word_size; i++)
         hyphens[i] = '0';
     hyphens[word_size] = '\0';

     /* now create a new char string showing hyphenation positions */
     /* count the hyphens and allocate space for the new hyphenated string */
     nHyphCount = 0;
     for (i = 0; i < word_size; i++)
         if (hyphens[i]&1)
             nHyphCount++;
     j = 0;
     for (i = 0; i < word_size; i++) {
         if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
             if (rep && pos && cut) {
                 if (!*rep && !*pos && !*cut) {
                     int k;
                     *rep = (char **) malloc(sizeof(char *) * word_size);
                     *pos = (int *) malloc(sizeof(int) * word_size);
                     *cut = (int *) malloc(sizeof(int) * word_size);
                     for (k = 0; k < word_size; k++) {
                         (*rep)[k] = NULL;
                         (*pos)[k] = 0;
                         (*cut)[k] = 0;
                     }
                 }
                 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
                 (*pos)[matchindex[i] - 1] = matchindex[i] - i;
                 (*cut)[matchindex[i] - 1] = matchlen[i];
             }
             j += strlen(matchrepl[matchindex[i]]);
             i += matchlen[i] - 1;
         }
     }

     if (matchrepl != matchrepl_buf) {
         hnj_free (matchrepl);
         hnj_free (matchlen);
         hnj_free (matchindex);
     }

     // recursive hyphenation of the first (compound) level segments
     if (dict->nextlevel) {
         char * rep2_buf[MAX_WORD];
         int pos2_buf[MAX_WORD];
         int cut2_buf[MAX_WORD];
         char hyphens2_buf[MAX_WORD];
         char ** rep2;
         int * pos2;
         int * cut2;
         char * hyphens2;
         int begin = 0;
         if (word_size < MAX_CHARS) {
             rep2 = rep2_buf;
             pos2 = pos2_buf;
             cut2 = cut2_buf;
             hyphens2 = hyphens2_buf;
         } else {
             rep2 = hnj_malloc (word_size * sizeof(char *));
             pos2 = hnj_malloc (word_size * sizeof(int));
             cut2 = hnj_malloc (word_size * sizeof(int));
             hyphens2 = hnj_malloc (word_size);
         }
         for (i = 0; i < word_size; i++) rep2[i] = NULL;
         for (i = 0; i < word_size; i++)
             if (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
                 if (i - begin > 1) {
                     int hyph = 0;
                     prep_word[i + 2] = '\0';
                     /* non-standard hyphenation at compound boundary (Schiffahrt) */
                     if (*rep && *pos && *cut && (*rep)[i]) {
                         char * l = strchr((*rep)[i], '=');
                         strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
                         if (l) {
                             hyph = (l - (*rep)[i]) - (*pos)[i];
                             prep_word[2 + i + hyph] = '\0';
                         }
                     }
                     hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
                         hyphens2, &rep2, &pos2, &cut2, clhmin,
                         crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
                     for (j = 0; j < i - begin - 1; j++) {
                         hyphens[begin + j] = hyphens2[j];
                         if (rep2[j] && rep && pos && cut) {
                             if (!*rep && !*pos && !*cut) {
                                 int k;
                                 *rep = (char **) malloc(sizeof(char *) * word_size);
                                 *pos = (int *) malloc(sizeof(int) * word_size);
                                 *cut = (int *) malloc(sizeof(int) * word_size);
                                 for (k = 0; k < word_size; k++) {
                                     (*rep)[k] = NULL;
                                     (*pos)[k] = 0;
                                     (*cut)[k] = 0;
                                 }
                             }
                             (*rep)[begin + j] = rep2[j];
                             (*pos)[begin + j] = pos2[j];
                             (*cut)[begin + j] = cut2[j];
                         }
                     }
                     prep_word[i + 2] = word[i + 1];
                     if (*rep && *pos && *cut && (*rep)[i]) {
                         strcpy(prep_word + 1, word);
                     }
                 }
                 begin = i + 1;
                 for (j = 0; j < word_size; j++) rep2[j] = NULL;
             }

         // non-compound
         if (begin == 0) {
             hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
                 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
             if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
                 rep, pos, cut, clhmin);
             if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
                 rep, pos, cut, crhmin);
         }

         if (rep2 != rep2_buf) {
             free(rep2);
             free(cut2);
             free(pos2);
             free(hyphens2);
         }
     }

     if (prep_word != prep_word_buf) hnj_free (prep_word);
     return 0;
 }

 /* UTF-8 normalization of hyphen and non-standard positions */
 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
 	char *** rep, int ** pos, int ** cut)
 {
     if ((((unsigned char) word[0]) >> 6) == 2) {
         fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
         return 1;
     }

     /* calculate UTF-8 character positions */
     int i, j, k;
     for (i = 0, j = -1; i < word_size; i++) {
         /* beginning of an UTF-8 character (not '10' start bits) */
         if ((((unsigned char) word[i]) >> 6) != 2) j++;
         hyphens[j] = hyphens[i];
         if (rep && pos && cut && *rep && *pos && *cut) {
             int l = (*pos)[i];
             (*pos)[j] = 0;
             for (k = 0; k < l; k++) {
                 if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
             }
             k = i - l + 1;
             l = k + (*cut)[i];
             (*cut)[j] = 0;
             for (; k < l; k++) {
                 if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
             }
             (*rep)[j] = (*rep)[i];
             if (j < i) {
                 (*rep)[i] = NULL;
                 (*pos)[i] = 0;
                 (*cut)[i] = 0;
             }
         }
     }
     hyphens[j + 1] = '\0';
     return 0;
 }

 /* get the word with all possible hyphenations (output: hyphword) */
 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
     char * hyphword, char *** rep, int ** pos, int ** cut)
 {
     int i, j;
     for (i = 0, j = 0; i < l; i++, j++) {
         if (hyphens[i]&1) {
             hyphword[j] = word[i];
             if (*rep && *pos && *cut && (*rep)[i]) {
                 strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
                 j += strlen((*rep)[i]) - (*pos)[i];
                 i += (*cut)[i] - (*pos)[i];
             } else hyphword[++j] = '=';
         } else hyphword[j] = word[i];
     }
     hyphword[j] = '\0';
 }


 /* main api function with default hyphenmin parameters */
 int hnj_hyphen_hyphenate2 (HyphenDict *dict,
     const char *word, int word_size, char * hyphens,
     char *hyphword, char *** rep, int ** pos, int ** cut)
 {
     hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
         dict->clhmin, dict->crhmin, 1, 1);
     hnj_hyphen_lhmin(dict->utf8, word, word_size,
         hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
     hnj_hyphen_rhmin(dict->utf8, word, word_size,
         hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
     if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
     if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
     return 0;
 }

 /* previous main api function with hyphenmin parameters */
 int hnj_hyphen_hyphenate3 (HyphenDict *dict,
 	const char *word, int word_size, char * hyphens,
 	char *hyphword, char *** rep, int ** pos, int ** cut,
 	int lhmin, int rhmin, int clhmin, int crhmin)
 {
     lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
     rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
     hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
         clhmin, crhmin, 1, 1);
     hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
         rep, pos, cut, (lhmin > 0 ? lhmin : 2));
     hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
         rep, pos, cut, (rhmin > 0 ? rhmin : 2));
     if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
     if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
     return 0;
 }