i18n/strmatch.cpp - platform/external/icu4c.git - Git at Google

 /*
 **********************************************************************
 *   Copyright (c) 2001-2012, International Business Machines Corporation
 *   and others.  All Rights Reserved.
 **********************************************************************
 *   Date        Name        Description
 *   07/23/01    aliu        Creation.
 **********************************************************************
 */

 #include "unicode/utypes.h"

 #if !UCONFIG_NO_TRANSLITERATION

 #include "strmatch.h"
 #include "rbt_data.h"
 #include "util.h"
 #include "unicode/uniset.h"
 #include "unicode/utf16.h"

 U_NAMESPACE_BEGIN

 // Class-ID boilerplate for StringMatcher.  NOTE(review): presumably expands
 // to the getStaticClassID()/getDynamicClassID() definitions -- the macro is
 // declared outside this file, confirm against ICU's uobject.h.
 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)

 /**
  * Construct a matcher whose pattern is the text of theString between
  * start and limit.
  *
  * @param theString   source of the pattern text
  * @param start       first index of the pattern within theString
  * @param limit       index just past the pattern within theString
  * @param segmentNum  segment number of this matcher; values greater than
  *                    zero make toPattern() wrap the pattern in parentheses
  * @param theData     rule data used to resolve stand-in characters; only
  *                    its address is stored, so it must outlive this object
  */
 StringMatcher::StringMatcher(
         const UnicodeString& theString,
         int32_t start,
         int32_t limit,
         int32_t segmentNum,
         const TransliterationRuleData& theData)
     : data(&theData),
       segmentNumber(segmentNum),
       matchStart(-1),   // -1/-1 means "no match recorded yet"
       matchLimit(-1)
 {
     // Copy the requested slice of the rule text into our own pattern.
     theString.extractBetween(start, limit, pattern);
 }

 /**
  * Copy constructor.  Duplicates the pattern, the segment number and the
  * currently recorded match range.  The rule-data pointer is copied as-is,
  * so both objects refer to the same TransliterationRuleData.
  */
 StringMatcher::StringMatcher(const StringMatcher& o)
     : UnicodeFunctor(o),
       UnicodeMatcher(o),
       UnicodeReplacer(o),
       pattern(o.pattern),
       data(o.data),
       segmentNumber(o.segmentNumber),
       matchStart(o.matchStart),
       matchLimit(o.matchLimit)
 {}

 /**
  * Destructor.  Nothing to release explicitly: the rule data is referenced,
  * not deleted, by this object.
  */
 StringMatcher::~StringMatcher() {}

 /**
  * Implement UnicodeFunctor.
  * @return a newly allocated copy of this object, made with the copy
  *         constructor
  */
 UnicodeFunctor* StringMatcher::clone() const {
     StringMatcher* duplicate = new StringMatcher(*this);
     return duplicate;
 }

 /**
  * UnicodeFunctor API.  Return this object viewed through its
  * UnicodeMatcher base.  The method is const, so constness is cast away
  * first and the result is then converted to the base-class pointer.
  */
 UnicodeMatcher* StringMatcher::toMatcher() const {
   return static_cast<UnicodeMatcher *>(const_cast<StringMatcher *>(this));
 }

 /**
  * UnicodeFunctor API.  Return this object viewed through its
  * UnicodeReplacer base.  The method is const, so constness is cast away
  * first and the result is then converted to the base-class pointer.
  */
 UnicodeReplacer* StringMatcher::toReplacer() const {
   return static_cast<UnicodeReplacer *>(const_cast<StringMatcher *>(this));
 }

 /**
  * Implement UnicodeMatcher.
  *
  * Match the pattern against text starting at offset.  Each code unit of
  * the pattern is either a stand-in for a sub-matcher (as reported by
  * data->lookupMatcher()), in which case matching is delegated to it, or a
  * literal that is compared against the text.
  *
  * @param text         the text to match against
  * @param offset       on input, the position at which to start; on
  *                     U_MATCH it is moved to the position reached by the
  *                     match.  It is left untouched for any other result.
  * @param limit        boundary of the matchable text.  When limit is less
  *                     than offset the pattern is matched backward.
  * @param incremental  if TRUE, a forward match that runs into limit before
  *                     the pattern is exhausted yields U_PARTIAL_MATCH
  * @return U_MATCH, U_MISMATCH, U_PARTIAL_MATCH, or whatever non-U_MATCH
  *         value a sub-matcher reported
  */
 UMatchDegree StringMatcher::matches(const Replaceable& text,
                                     int32_t& offset,
                                     int32_t limit,
                                     UBool incremental) {
     // Work on a copy so that offset only changes on a full match.
     int32_t pos = offset;

     if (limit < pos) {
         // Reverse direction: consume the pattern from its end.
         int32_t idx = pattern.length();
         while (--idx >= 0) {
             UChar keyChar = pattern.charAt(idx);
             UnicodeMatcher* sub = data->lookupMatcher(keyChar);
             if (sub != 0) {
                 UMatchDegree degree =
                     sub->matches(text, pos, limit, incremental);
                 if (degree != U_MATCH) {
                     return degree;
                 }
                 continue;
             }
             // Literal: must still be inside the text and must be equal.
             if (pos <= limit || keyChar != text.charAt(pos)) {
                 return U_MISMATCH;
             }
             --pos;
         }
         // Record the range in normal forward (start, limit) form, but only
         // when nothing is recorded yet -- we want the rightmost match.
         if (matchStart < 0) {
             matchStart = pos + 1;
             matchLimit = offset + 1;
         }
         offset = pos;
         return U_MATCH;
     }

     // Forward direction.
     for (int32_t idx = 0; idx < pattern.length(); ++idx) {
         if (incremental && pos == limit) {
             // Ran out of context with no mismatch and pattern left over.
             return U_PARTIAL_MATCH;
         }
         UChar keyChar = pattern.charAt(idx);
         UnicodeMatcher* sub = data->lookupMatcher(keyChar);
         if (sub != 0) {
             UMatchDegree degree =
                 sub->matches(text, pos, limit, incremental);
             if (degree != U_MATCH) {
                 return degree;
             }
             continue;
         }
         // The pos/limit test is redundant when incremental is TRUE (it
         // was done above) but is required otherwise.
         if (pos >= limit || keyChar != text.charAt(pos)) {
             return U_MISMATCH;
         }
         ++pos;
     }

     // Record the match position.
     matchStart = offset;
     matchLimit = pos;
     offset = pos;
     return U_MATCH;
 }

 /**
  * Implement UnicodeMatcher.
  *
  * Rebuild a rule-syntax representation of this matcher in result (any
  * previous contents are discarded).  Literal pattern characters are
  * appended through ICU_Utility::appendToRule(); stand-ins are replaced by
  * the pattern of the sub-matcher they refer to.  When segmentNumber is
  * greater than zero the output is wrapped in parentheses.
  *
  * @param result             receives the pattern
  * @param escapeUnprintable  forwarded to appendToRule() and to the
  *                           sub-matchers' toPattern()
  * @return result
  */
 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
                                         UBool escapeUnprintable) const
 {
     const UChar kOpenParen  = 40; /*(*/
     const UChar kCloseParen = 41; /*)*/
     const UBool isSegment = (segmentNumber > 0);

     UnicodeString subPattern, quoteBuf;
     result.truncate(0);
     if (isSegment) {
         result.append(kOpenParen);
     }

     for (int32_t idx = 0; idx < pattern.length(); ++idx) {
         UChar keyChar = pattern.charAt(idx);
         const UnicodeMatcher* sub = data->lookupMatcher(keyChar);
         if (sub != 0) {
             // Stand-in: splice in the sub-matcher's own pattern.
             ICU_Utility::appendToRule(result,
                                       sub->toPattern(subPattern, escapeUnprintable),
                                       TRUE, escapeUnprintable, quoteBuf);
         } else {
             ICU_Utility::appendToRule(result, keyChar, FALSE,
                                       escapeUnprintable, quoteBuf);
         }
     }

     // NOTE(review): the closing parenthesis is appended before quoteBuf is
     // flushed, so anything still pending in quoteBuf ends up after it.
     // Kept as-is; confirm this ordering is intended.
     if (isSegment) {
         result.append(kCloseParen);
     }
     // Flush quoteBuf out to result
     ICU_Utility::appendToRule(result, -1,
                               TRUE, escapeUnprintable, quoteBuf);
     return result;
 }

 /**
  * Implement UnicodeMatcher.
  *
  * @param v  an index byte
  * @return TRUE when the pattern is empty.  Otherwise the answer depends on
  *         the first code point of the pattern: if it is a stand-in, the
  *         sub-matcher decides; if it is a literal, the result is whether
  *         its low 8 bits equal v.
  */
 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
     if (pattern.length() == 0) {
         return TRUE;
     }
     UChar32 first = pattern.char32At(0);
     const UnicodeMatcher *sub = data->lookupMatcher(first);
     if (sub != 0) {
         return sub->matchesIndexValue(v);
     }
     return (first & 0xFF) == v;
 }

 /**
  * Implement UnicodeMatcher.
  *
  * Walk the pattern one code point at a time.  Literal code points are
  * added to toUnionTo directly; for stand-ins the referenced sub-matcher
  * adds its own match set.
  *
  * @param toUnionTo  the set to add to
  */
 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
     int32_t pos = 0;
     while (pos < pattern.length()) {
         UChar32 cp = pattern.char32At(pos);
         const UnicodeMatcher* sub = data->lookupMatcher(cp);
         if (sub != NULL) {
             sub->addMatchSetTo(toUnionTo);
         } else {
             toUnionTo.add(cp);
         }
         // Step over one or two UTF-16 code units.
         pos += U16_LENGTH(cp);
     }
 }

 /**
  * UnicodeReplacer API.
  *
  * Replace text[start, limit) by the text most recently recorded by
  * matches(), i.e. text[matchStart, matchLimit).
  *
  * @param text   the text being edited
  * @param start  start of the range to replace
  * @param limit  end (exclusive) of the range to replace
  * @param cursor unused
  * @return the length of the copied segment; 0 when no match is recorded
  *         or the recorded match is empty
  */
 int32_t StringMatcher::replace(Replaceable& text,
                                int32_t start,
                                int32_t limit,
                                int32_t& /*cursor*/) {
     int32_t copiedLen = 0;

     // A negative matchStart means there was no match, i.e. a quantifier
     // matched zero-length.  E.g., x (a)* y matched "xy".
     if (matchStart >= 0 && matchStart != matchLimit) {
         // Copy segment with out-of-band data; it is placed at limit,
         // directly after the range that is about to be removed.
         text.copy(matchStart, matchLimit, limit);
         copiedLen = matchLimit - matchStart;
     }

     // Delete the original text.
     text.handleReplaceBetween(start, limit, UnicodeString());

     return copiedLen;
 }

 /**
  * UnicodeReplacer API.
  *
  * Produce the replacer form of this segment: a dollar sign followed by
  * the segment number.
  *
  * @param rule  receives the pattern; previous contents are discarded
  * @return rule
  */
 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
                                                 UBool /*escapeUnprintable*/) const {
     // assert(segmentNumber > 0);
     const UChar kDollar = 0x0024; /*$*/

     rule.truncate(0);
     rule.append(kDollar);
     // 10, 1: presumably radix and minimum digit count -- see util.h.
     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
     return rule;
 }

 /**
  * Forget any recorded match by setting both ends of the range back to -1.
  * This must be called before performing a set of matches with this
  * segment.
  */
 void StringMatcher::resetMatch() {
     matchLimit = -1;
     matchStart = -1;
 }

 /**
  * Union the set of all characters that may be output by this object
  * into the given set.  This implementation deliberately adds nothing.
  * @param toUnionTo the set into which to union the output characters
  */
 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
     // What this replacer emits is the source text between matchStart and
     // matchLimit, which depends on the input and so cannot be computed
     // here.  The choices are to add nothing or to add ALL characters;
     // adding nothing is probably the more useful of the two.
 }

 /**
  * Implement UnicodeFunctor
  */
 void StringMatcher::setData(const TransliterationRuleData* d) {
     data = d;
     int32_t i = 0;
     while (i<pattern.length()) {
         UChar32 c = pattern.char32At(i);
         UnicodeFunctor* f = data->lookup(c);
         if (f != NULL) {
             f->setData(data);
         }
         i += U16_LENGTH(c);
     }
 }

 U_NAMESPACE_END

 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

 //eof
	/*
	**********************************************************************
	* Copyright (c) 2001-2012, International Business Machines Corporation
	* and others. All Rights Reserved.
	**********************************************************************
	* Date Name Description
	* 07/23/01 aliu Creation.
	**********************************************************************
	*/

	#include "unicode/utypes.h"

	#if !UCONFIG_NO_TRANSLITERATION

	#include "strmatch.h"
	#include "rbt_data.h"
	#include "util.h"
	#include "unicode/uniset.h"
	#include "unicode/utf16.h"

	U_NAMESPACE_BEGIN

	// Class-ID boilerplate for StringMatcher.  NOTE(review): presumably expands
	// to the getStaticClassID()/getDynamicClassID() definitions -- the macro is
	// declared outside this file, confirm against ICU's uobject.h.
	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)

	/**
	 * Construct a matcher whose pattern is the text of theString between
	 * start and limit.
	 *
	 * @param theString   source of the pattern text
	 * @param start       first index of the pattern within theString
	 * @param limit       index just past the pattern within theString
	 * @param segmentNum  segment number of this matcher; values greater than
	 *                    zero make toPattern() wrap the pattern in parentheses
	 * @param theData     rule data used to resolve stand-in characters; only
	 *                    its address is stored, so it must outlive this object
	 */
	StringMatcher::StringMatcher(
	        const UnicodeString& theString,
	        int32_t start,
	        int32_t limit,
	        int32_t segmentNum,
	        const TransliterationRuleData& theData)
	    : data(&theData),
	      segmentNumber(segmentNum),
	      matchStart(-1),   // -1/-1 means "no match recorded yet"
	      matchLimit(-1)
	{
	    // Copy the requested slice of the rule text into our own pattern.
	    theString.extractBetween(start, limit, pattern);
	}

	/**
	 * Copy constructor.  Duplicates the pattern, the segment number and the
	 * currently recorded match range.  The rule-data pointer is copied as-is,
	 * so both objects refer to the same TransliterationRuleData.
	 */
	StringMatcher::StringMatcher(const StringMatcher& o)
	    : UnicodeFunctor(o),
	      UnicodeMatcher(o),
	      UnicodeReplacer(o),
	      pattern(o.pattern),
	      data(o.data),
	      segmentNumber(o.segmentNumber),
	      matchStart(o.matchStart),
	      matchLimit(o.matchLimit)
	{}

	/**
	 * Destructor.  Nothing to release explicitly: the rule data is referenced,
	 * not deleted, by this object.
	 */
	StringMatcher::~StringMatcher() {}

	/**
	 * Implement UnicodeFunctor.
	 * @return a newly allocated copy of this object, made with the copy
	 *         constructor
	 */
	UnicodeFunctor* StringMatcher::clone() const {
	    StringMatcher* duplicate = new StringMatcher(*this);
	    return duplicate;
	}

	/**
	 * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
	 * and return the pointer.  The method is const, so constness is cast
	 * away first and the result is then converted to the base-class pointer.
	 */
	UnicodeMatcher* StringMatcher::toMatcher() const {
	    // The pointer declarators below had been stripped, which made the
	    // locals objects instead of pointers and the casts ill-formed.
	    StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
	    UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);

	    return nonconst_base;
	}

	/**
	 * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
	 * and return the pointer.  The method is const, so constness is cast
	 * away first and the result is then converted to the base-class pointer.
	 */
	UnicodeReplacer* StringMatcher::toReplacer() const {
	    // The pointer declarators below had been stripped, which made the
	    // locals objects instead of pointers and the casts ill-formed.
	    StringMatcher   *nonconst_this = const_cast<StringMatcher *>(this);
	    UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);

	    return nonconst_base;
	}

	/**
	 * Implement UnicodeMatcher.
	 *
	 * Match the pattern against text starting at offset.  Each code unit of
	 * the pattern is either a stand-in for a sub-matcher (as reported by
	 * data->lookupMatcher()), in which case matching is delegated to it, or a
	 * literal that is compared against the text.
	 *
	 * @param text         the text to match against
	 * @param offset       on input, the position at which to start; on
	 *                     U_MATCH it is moved to the position reached by the
	 *                     match.  It is left untouched for any other result.
	 * @param limit        boundary of the matchable text.  When limit is less
	 *                     than offset the pattern is matched backward.
	 * @param incremental  if TRUE, a forward match that runs into limit before
	 *                     the pattern is exhausted yields U_PARTIAL_MATCH
	 * @return U_MATCH, U_MISMATCH, U_PARTIAL_MATCH, or whatever non-U_MATCH
	 *         value a sub-matcher reported
	 */
	UMatchDegree StringMatcher::matches(const Replaceable& text,
	                                    int32_t& offset,
	                                    int32_t limit,
	                                    UBool incremental) {
	    // Work on a copy so that offset only changes on a full match.
	    int32_t pos = offset;

	    if (limit < pos) {
	        // Reverse direction: consume the pattern from its end.
	        int32_t idx = pattern.length();
	        while (--idx >= 0) {
	            UChar keyChar = pattern.charAt(idx);
	            UnicodeMatcher* sub = data->lookupMatcher(keyChar);
	            if (sub != 0) {
	                UMatchDegree degree =
	                    sub->matches(text, pos, limit, incremental);
	                if (degree != U_MATCH) {
	                    return degree;
	                }
	                continue;
	            }
	            // Literal: must still be inside the text and must be equal.
	            if (pos <= limit || keyChar != text.charAt(pos)) {
	                return U_MISMATCH;
	            }
	            --pos;
	        }
	        // Record the range in normal forward (start, limit) form, but only
	        // when nothing is recorded yet -- we want the rightmost match.
	        if (matchStart < 0) {
	            matchStart = pos + 1;
	            matchLimit = offset + 1;
	        }
	        offset = pos;
	        return U_MATCH;
	    }

	    // Forward direction.
	    for (int32_t idx = 0; idx < pattern.length(); ++idx) {
	        if (incremental && pos == limit) {
	            // Ran out of context with no mismatch and pattern left over.
	            return U_PARTIAL_MATCH;
	        }
	        UChar keyChar = pattern.charAt(idx);
	        UnicodeMatcher* sub = data->lookupMatcher(keyChar);
	        if (sub != 0) {
	            UMatchDegree degree =
	                sub->matches(text, pos, limit, incremental);
	            if (degree != U_MATCH) {
	                return degree;
	            }
	            continue;
	        }
	        // The pos/limit test is redundant when incremental is TRUE (it
	        // was done above) but is required otherwise.
	        if (pos >= limit || keyChar != text.charAt(pos)) {
	            return U_MISMATCH;
	        }
	        ++pos;
	    }

	    // Record the match position.
	    matchStart = offset;
	    matchLimit = pos;
	    offset = pos;
	    return U_MATCH;
	}

	/**
	 * Implement UnicodeMatcher.
	 *
	 * Rebuild a rule-syntax representation of this matcher in result (any
	 * previous contents are discarded).  Literal pattern characters are
	 * appended through ICU_Utility::appendToRule(); stand-ins are replaced by
	 * the pattern of the sub-matcher they refer to.  When segmentNumber is
	 * greater than zero the output is wrapped in parentheses.
	 *
	 * @param result             receives the pattern
	 * @param escapeUnprintable  forwarded to appendToRule() and to the
	 *                           sub-matchers' toPattern()
	 * @return result
	 */
	UnicodeString& StringMatcher::toPattern(UnicodeString& result,
	                                        UBool escapeUnprintable) const
	{
	    result.truncate(0);
	    UnicodeString str, quoteBuf;
	    if (segmentNumber > 0) {
	        // Comment delimiters restored: the text read "/(/" and did not parse.
	        result.append((UChar)40); /*(*/
	    }
	    for (int32_t i=0; i<pattern.length(); ++i) {
	        UChar keyChar = pattern.charAt(i);
	        const UnicodeMatcher* m = data->lookupMatcher(keyChar);
	        if (m == 0) {
	            // Plain literal.
	            ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
	        } else {
	            // Stand-in: splice in the sub-matcher's own pattern.
	            ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
	                                      TRUE, escapeUnprintable, quoteBuf);
	        }
	    }
	    if (segmentNumber > 0) {
	        result.append((UChar)41); /*)*/
	    }
	    // Flush quoteBuf out to result
	    ICU_Utility::appendToRule(result, -1,
	                              TRUE, escapeUnprintable, quoteBuf);
	    return result;
	}

	/**
	 * Implement UnicodeMatcher.
	 *
	 * @param v  an index byte
	 * @return TRUE when the pattern is empty.  Otherwise the answer depends on
	 *         the first code point of the pattern: if it is a stand-in, the
	 *         sub-matcher decides; if it is a literal, the result is whether
	 *         its low 8 bits equal v.
	 */
	UBool StringMatcher::matchesIndexValue(uint8_t v) const {
	    if (pattern.length() == 0) {
	        return TRUE;
	    }
	    UChar32 first = pattern.char32At(0);
	    const UnicodeMatcher *sub = data->lookupMatcher(first);
	    if (sub != 0) {
	        return sub->matchesIndexValue(v);
	    }
	    return (first & 0xFF) == v;
	}

	/**
	 * Implement UnicodeMatcher.
	 *
	 * Walk the pattern one code point at a time.  Literal code points are
	 * added to toUnionTo directly; for stand-ins the referenced sub-matcher
	 * adds its own match set.
	 *
	 * @param toUnionTo  the set to add to
	 */
	void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
	    int32_t pos = 0;
	    while (pos < pattern.length()) {
	        UChar32 cp = pattern.char32At(pos);
	        const UnicodeMatcher* sub = data->lookupMatcher(cp);
	        if (sub != NULL) {
	            sub->addMatchSetTo(toUnionTo);
	        } else {
	            toUnionTo.add(cp);
	        }
	        // Step over one or two UTF-16 code units.
	        pos += U16_LENGTH(cp);
	    }
	}

	/**
	 * UnicodeReplacer API.
	 *
	 * Replace text[start, limit) by the text most recently recorded by
	 * matches(), i.e. text[matchStart, matchLimit).
	 *
	 * @param text   the text being edited
	 * @param start  start of the range to replace
	 * @param limit  end (exclusive) of the range to replace
	 * @param cursor unused (the parameter is left unnamed; its comment
	 *               delimiters had been stripped, leaving "/cursor/")
	 * @return the length of the copied segment; 0 when no match is recorded
	 *         or the recorded match is empty
	 */
	int32_t StringMatcher::replace(Replaceable& text,
	                               int32_t start,
	                               int32_t limit,
	                               int32_t& /*cursor*/) {

	    int32_t outLen = 0;

	    // Copy segment with out-of-band data
	    int32_t dest = limit;
	    // If there was no match, that means that a quantifier
	    // matched zero-length.  E.g., x (a)* y matched "xy".
	    if (matchStart >= 0) {
	        if (matchStart != matchLimit) {
	            text.copy(matchStart, matchLimit, dest);
	            outLen = matchLimit - matchStart;
	        }
	    }

	    text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text

	    return outLen;
	}

	/**
	 * UnicodeReplacer API.
	 *
	 * Produce the replacer form of this segment: a dollar sign followed by
	 * the segment number.
	 *
	 * @param rule  receives the pattern; previous contents are discarded
	 * @return rule
	 */
	UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
	                                                UBool /*escapeUnprintable*/) const {
	    // Comment delimiters restored on the unnamed parameter above and on
	    // the '$' annotation below; both had been reduced to bare slashes.
	    // assert(segmentNumber > 0);
	    rule.truncate(0);
	    rule.append((UChar)0x0024 /*$*/);
	    // 10, 1: presumably radix and minimum digit count -- see util.h.
	    ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
	    return rule;
	}

	/**
	 * Forget any recorded match by setting both ends of the range back to -1.
	 * This must be called before performing a set of matches with this
	 * segment.
	 */
	void StringMatcher::resetMatch() {
	    matchLimit = -1;
	    matchStart = -1;
	}

	/**
	 * Union the set of all characters that may be output by this object
	 * into the given set.  This implementation deliberately adds nothing.
	 * @param toUnionTo the set into which to union the output characters
	 *                  (unused; the parameter is left unnamed)
	 */
	void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
	    // The output of this replacer varies; it is the source text between
	    // matchStart and matchLimit.  Since this varies depending on the
	    // input text, we can't compute it here.  We can either do nothing
	    // or we can add ALL characters to the set.  It's probably more useful
	    // to do nothing.
	}

	/**
	* Implement UnicodeFunctor
	*/
	void StringMatcher::setData(const TransliterationRuleData* d) {
	data = d;
	int32_t i = 0;
	while (i<pattern.length()) {
	UChar32 c = pattern.char32At(i);
	UnicodeFunctor* f = data->lookup(c);
	if (f != NULL) {
	f->setData(data);
	}
	i += U16_LENGTH(c);
	}
	}

	U_NAMESPACE_END

	#endif /* #if !UCONFIG_NO_TRANSLITERATION */

	//eof