Backport BreakIterator::refreshInputText to icu4c 4.8. Corresponds to these two patches: 1. http://bugs.icu-project.org/trac/changeset/30145 2. http://bugs.icu-project.org/trac/changeset/30201 For icu4c bug http://bugs.icu-project.org/trac/ticket/8490. (cherry-pick of 0745c569f807bddcfc8d9f21eba504955fcdba98.) Bug: 7288264 Bug: 7307154 Change-Id: Ice80d53c2f5722bb24ad841a943252b41b97cfa6

commit: 4fceb0aeb072e9c6879c37fbcdcef2c4286c4719 [log] [tgz]
author: Elliott Hughes <enh@google.com> Mon Oct 08 13:16:08 2012 -0700
committer: Elliott Hughes <enh@google.com> Mon Oct 08 17:41:37 2012 -0700
tree: 3dfb4722040401f394856af95c593e3ad423cbf4
parent: e04f6a6eb7cc29848023370321cb049b2e6179ad [diff]
diff --git a/common/rbbi.cpp b/common/rbbi.cpp
index 7196f04..ddee7b4 100644
--- a/common/rbbi.cpp
+++ b/common/rbbi.cpp

@@ -486,6 +486,37 @@
 }
 
 
+/**
+ *  Provide a new UText for the input text.  Must reference text with contents identical
+ *  to the original.  The native index of the old text is re-applied to the new one.
+ *  Intended for use with text data originating in Java (garbage collected) environments
+ *  where the data may be moved in memory at arbitrary times.  Always returns *this.
+ */
+RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
+    if (U_FAILURE(status)) {    // incoming failure: return without touching fText
+        return *this;
+    }
+    if (input == NULL) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return *this;
+    }
+    int64_t pos = utext_getNativeIndex(fText);    // saved before fText is overwritten below
+    //  Shallow read-only clone of the new UText into the existing input UText
+    fText = utext_clone(fText, input, FALSE, TRUE, &status);
+    if (U_FAILURE(status)) {    // clone failed; note that fText was already reassigned above
+        return *this;
+    }
+    utext_setNativeIndex(fText, pos);
+    if (utext_getNativeIndex(fText) != pos) {
+        // Sanity check.  The new input utext is supposed to have the exact same
+        // contents as the old.  If we can't set to the same position, it doesn't.
+        // The contents underlying the old utext might be invalid at this point,
+        // so it's not safe to check directly.
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    return *this;
+}
+
 
 /**
  * Sets the current iteration position to the beginning of the text.

diff --git a/common/ubrk.cpp b/common/ubrk.cpp
index 141913f..70d7e48 100644
--- a/common/ubrk.cpp
+++ b/common/ubrk.cpp

@@ -1,6 +1,6 @@
 /*
 ********************************************************************************
-*   Copyright (C) 1996-2008, International Business Machines
+*   Copyright (C) 1996-2011, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 ********************************************************************************
 */
@@ -300,4 +300,14 @@
 }
 
 
+void ubrk_refreshUText(UBreakIterator *bi,    // C wrapper: forwards to BreakIterator::refreshInputText()
+                       UText          *text,    // replacement UText, passed through unchanged
+                       UErrorCode     *status)    // dereferenced below without a NULL check
+{    // NOTE(review): the ubrk.h declaration carries U_DRAFT/U_EXPORT2; they are not repeated here
+    BreakIterator *bii = reinterpret_cast<BreakIterator *>(bi);    // the opaque handle is treated as a BreakIterator
+    bii->refreshInputText(text, *status);    // the returned reference is discarded
+}
+
+
+
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

diff --git a/common/unicode/brkiter.h b/common/unicode/brkiter.h
index bdd3cc7..6cae690 100644
--- a/common/unicode/brkiter.h
+++ b/common/unicode/brkiter.h

@@ -1,6 +1,6 @@
 /*
 ********************************************************************************
-*   Copyright (C) 1997-2010, International Business Machines
+*   Copyright (C) 1997-2011, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 ********************************************************************************
 *
@@ -514,6 +514,33 @@
      */
     const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
 
+    /**
+     *  Set the subject text string upon which the break iterator is operating
+     *  without changing any other aspect of the iteration state.
+     *  The new and previous text strings must have the same content.
+     *
+     *  This function is intended for use in environments where ICU is operating on
+     *  strings that may move around in memory.  It provides a mechanism for notifying
+     *  ICU that the string has been relocated, and providing a new UText to access the
+     *  string in its new position.
+     *
+     *  Note that the break iterator implementation never copies the underlying text
+     *  of a string being processed, but always operates directly on the original text
+     *  provided by the user. Refreshing simply drops the references to the old text
+     *  and replaces them with references to the new.
+     *
+     *  Caution:  this function is normally used only by very specialized,
+     *  system-level code.  One example use case is with garbage collection that moves
+     *  the text in memory.  This is a pure virtual function; subclasses implement it.
+     *
+     * @param input      The new (moved) text string, supplied as a UText.
+     * @param status     Receives errors detected by this function.
+     * @return           *this
+     *
+     * @draft ICU 5.0
+     */
+    virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
+
  private:
     static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
     static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);

diff --git a/common/unicode/rbbi.h b/common/unicode/rbbi.h
index f93b577..529a589 100644
--- a/common/unicode/rbbi.h
+++ b/common/unicode/rbbi.h

@@ -633,6 +633,33 @@
      */
     virtual const uint8_t *getBinaryRules(uint32_t &length);
 
+    /**
+     *  Set the subject text string upon which the break iterator is operating
+     *  without changing any other aspect of the iteration state.
+     *  The new and previous text strings must have the same content.
+     *
+     *  This function is intended for use in environments where ICU is operating on
+     *  strings that may move around in memory.  It provides a mechanism for notifying
+     *  ICU that the string has been relocated, and providing a new UText to access the
+     *  string in its new position.
+     *
+     *  Note that the break iterator implementation never copies the underlying text
+     *  of a string being processed, but always operates directly on the original text
+     *  provided by the user. Refreshing simply drops the references to the old text
+     *  and replaces them with references to the new.
+     *
+     *  Caution:  this function is normally used only by very specialized,
+     *  system-level code.  One example use case is with garbage collection that moves
+     *  the text in memory.
+     *
+     * @param input      The new (moved) text string.  Must not be NULL.
+     * @param status     Receives errors detected by this function; U_ILLEGAL_ARGUMENT_ERROR
+     *                   if input is NULL or the old position cannot be set in the new text.
+     * @return           *this
+     * @draft ICU 5.0
+     */
+    virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
+
 
 protected:
     //=======================================================================

diff --git a/common/unicode/ubrk.h b/common/unicode/ubrk.h
index 96dd2af..c596861 100644
--- a/common/unicode/ubrk.h
+++ b/common/unicode/ubrk.h

@@ -496,6 +496,37 @@
 ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
 
 
+/**
+  *  Set the subject text string upon which the break iterator is operating
+  *  without changing any other aspect of the state.
+  *  The new and previous text strings must have the same content.
+  *
+  *  This function is intended for use in environments where ICU is operating on
+  *  strings that may move around in memory.  It provides a mechanism for notifying
+  *  ICU that the string has been relocated, and providing a new UText to access the
+  *  string in its new position.
+  *
+  *  Note that the break iterator never copies the underlying text
+  *  of a string being processed, but always operates directly on the original text
+  *  provided by the user. Refreshing simply drops the references to the old text
+  *  and replaces them with references to the new.
+  *
+  *  Caution:  this function is normally used only by very specialized
+  *            system-level code.   One example use case is with garbage collection
+  *            that moves the text in memory.
+  *
+  * @param bi         The break iterator.
+  * @param text       The new (moved) text string, supplied as a UText.
+  * @param status     Receives errors detected by this function.  Must not be NULL.
+  *
+  * @draft ICU 5.0
+  */
+U_DRAFT void U_EXPORT2
+ubrk_refreshUText(UBreakIterator *bi,
+                       UText          *text,
+                       UErrorCode     *status);
+
+
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
 
 #endif

diff --git a/test/cintltst/cbiapts.c b/test/cintltst/cbiapts.c
index b3c1f7a..1dab12f 100644
--- a/test/cintltst/cbiapts.c
+++ b/test/cintltst/cbiapts.c

@@ -44,6 +44,7 @@
 static void TestBreakIteratorStatusVec(void);
 static void TestBreakIteratorUText(void);
 static void TestBreakIteratorTailoring(void);
+static void TestBreakIteratorRefresh(void);
 
 void addBrkIterAPITest(TestNode** root);
 
@@ -58,6 +59,7 @@
     addTest(root, &TestBreakIteratorRuleError, "tstxtbd/cbiapts/TestBreakIteratorRuleError");
     addTest(root, &TestBreakIteratorStatusVec, "tstxtbd/cbiapts/TestBreakIteratorStatusVec");
     addTest(root, &TestBreakIteratorTailoring, "tstxtbd/cbiapts/TestBreakIteratorTailoring");
+    addTest(root, &TestBreakIteratorRefresh, "tstxtbd/cbiapts/TestBreakIteratorRefresh");
 }
 
 #define CLONETEST_ITERATOR_COUNT 2
@@ -823,4 +825,52 @@
     }
 }
 
+
+static void TestBreakIteratorRefresh(void) {
+    /*
+     *  RefreshInput changes out the input of a Break Iterator without
+     *    changing anything else in the iterator's state.  Used with Java JNI,
+     *    when Java moves the underlying string storage.   This test
+     *    runs a ubrk_next() repeatedly, moving the text in the middle of the sequence.
+     *    The right set of boundaries should still be found after ubrk_refreshUText().
+     */
+    UChar testStr[]  = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0};  /* = " A B C D"  */
+    UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,  0};  /* same size as testStr; receives the copy */
+    UErrorCode status = U_ZERO_ERROR;
+    UBreakIterator *bi;
+    UText ut1 = UTEXT_INITIALIZER;  /* opened over testStr */
+    UText ut2 = UTEXT_INITIALIZER;  /* opened over movedStr */
+    
+    bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);  /* opened with no text; it is set below */
+    TEST_ASSERT_SUCCESS(status);
+
+    utext_openUChars(&ut1, testStr, -1, &status);
+    TEST_ASSERT_SUCCESS(status);
+    ubrk_setUText(bi, &ut1, &status);
+    TEST_ASSERT_SUCCESS(status);
+
+    /* Line boundaries will occur before each letter in the original string */
+    TEST_ASSERT(1 == ubrk_next(bi));
+    TEST_ASSERT(3 == ubrk_next(bi));
+    
+    /* Move the string, kill the original string.  */
+    u_strcpy(movedStr, testStr);
+    u_memset(testStr, 0x20, u_strlen(testStr));  /* the old storage now holds only spaces */
+    utext_openUChars(&ut2, movedStr, -1, &status);
+    TEST_ASSERT_SUCCESS(status);
+    ubrk_refreshUText(bi, &ut2, &status);
+    TEST_ASSERT_SUCCESS(status);
+
+    /* Find the following matches, now working in the moved string. */
+    TEST_ASSERT(5 == ubrk_next(bi));
+    TEST_ASSERT(7 == ubrk_next(bi));
+    TEST_ASSERT(8 == ubrk_next(bi));  /* 8 is the length of the test string */
+    TEST_ASSERT(UBRK_DONE == ubrk_next(bi));
+    TEST_ASSERT_SUCCESS(status);  /* status is not passed to ubrk_next(), so it is unchanged here */
+
+    ubrk_close(bi);
+    utext_close(&ut1);
+    utext_close(&ut2);
+}
+
 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */

diff --git a/test/intltest/rbbiapts.cpp b/test/intltest/rbbiapts.cpp
index 89afdb6..cb40076 100644
--- a/test/intltest/rbbiapts.cpp
+++ b/test/intltest/rbbiapts.cpp

@@ -1122,6 +1122,54 @@
     }
 }
 
+
+void RBBIAPITest::TestRefreshInputText() {
+    /*
+     *  RefreshInput changes out the input of a Break Iterator without
+     *    changing anything else in the iterator's state.  Used with Java JNI,
+     *    when Java moves the underlying string storage.   This test
+     *    runs BreakIterator::next() repeatedly, moving the text in the middle of the sequence.
+     *    The right set of boundaries should still be found after refreshInputText().
+     */
+    UChar testStr[]  = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0};  /* = " A B C D"  */
+    UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,  0};  /* same size as testStr; receives the copy */
+    UErrorCode status = U_ZERO_ERROR;
+    UText ut1 = UTEXT_INITIALIZER;  /* opened over testStr */
+    UText ut2 = UTEXT_INITIALIZER;  /* opened over movedStr */
+    RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
+    TEST_ASSERT_SUCCESS(status);  /* NOTE(review): confirm this macro stops the test on failure; bi is dereferenced below */
+
+    utext_openUChars(&ut1, testStr, -1, &status);
+    TEST_ASSERT_SUCCESS(status);
+    bi->setText(&ut1, status);
+    TEST_ASSERT_SUCCESS(status);
+
+    /* Line boundaries will occur before each letter in the original string */
+    TEST_ASSERT(1 == bi->next());
+    TEST_ASSERT(3 == bi->next());
+    
+    /* Move the string, kill the original string.  */
+    u_strcpy(movedStr, testStr);
+    u_memset(testStr, 0x20, u_strlen(testStr));  /* the old storage now holds only spaces */
+    utext_openUChars(&ut2, movedStr, -1, &status);
+    TEST_ASSERT_SUCCESS(status);
+    RuleBasedBreakIterator *returnedBI = &bi->refreshInputText(&ut2, status);
+    TEST_ASSERT_SUCCESS(status);
+    TEST_ASSERT(bi == returnedBI);  /* refreshInputText() must return *this */
+
+    /* Find the following matches, now working in the moved string. */
+    TEST_ASSERT(5 == bi->next());
+    TEST_ASSERT(7 == bi->next());
+    TEST_ASSERT(8 == bi->next());  /* 8 is the length of the test string */
+    TEST_ASSERT(UBRK_DONE == bi->next());
+
+    delete bi;
+    utext_close(&ut1);
+    utext_close(&ut2);
+
+}
+
+
 //---------------------------------------------
 // runIndexedTest
 //---------------------------------------------
@@ -1153,6 +1201,7 @@
 #else
         case  9: case 10: case 11: case 12: case 13: name = "skip"; break;
 #endif
+        case 14: name = "TestRefreshInputText"; if (exec) TestRefreshInputText(); break;
 
         default: name = ""; break; // needed to end loop
     }

diff --git a/test/intltest/rbbiapts.h b/test/intltest/rbbiapts.h
index 0ce64ac..d9a25aa 100644
--- a/test/intltest/rbbiapts.h
+++ b/test/intltest/rbbiapts.h

@@ -86,6 +86,8 @@
 
     void TestRegistration();
 
+    void TestRefreshInputText();
+
     /**
      *Internal subroutines
      **/
commit	4fceb0aeb072e9c6879c37fbcdcef2c4286c4719	[log] [tgz]
author	Elliott Hughes <enh@google.com>	Mon Oct 08 13:16:08 2012 -0700
committer	Elliott Hughes <enh@google.com>	Mon Oct 08 17:41:37 2012 -0700
tree	3dfb4722040401f394856af95c593e3ad423cbf4
parent	e04f6a6eb7cc29848023370321cb049b2e6179ad [diff]