| // Copyright (C) 2011 The Libphonenumber Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| // Author: Lara Rennie |
| // Author: Tao Huang |
| // |
| // Implementation of a stateful class that finds and extracts telephone numbers |
| // from text. |
| |
| #include "phonenumbers/phonenumbermatcher.h" |
| |
| #ifndef I18N_PHONENUMBERS_USE_ICU_REGEXP |
| #error phonenumbermatcher depends on ICU \ |
| (i.e. I18N_PHONENUMBERS_USE_ICU_REGEXP must be set) |
| #endif // I18N_PHONENUMBERS_USE_ICU_REGEXP |
| |
| #include <ctype.h> |
| #include <stddef.h> |
| #include <limits> |
| #include <map> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include <unicode/uchar.h> |
| |
| #include "phonenumbers/alternate_format.h" |
| #include "phonenumbers/base/logging.h" |
| #include "phonenumbers/base/memory/scoped_ptr.h" |
| #include "phonenumbers/base/memory/singleton.h" |
| #include "phonenumbers/callback.h" |
| #include "phonenumbers/default_logger.h" |
| #include "phonenumbers/encoding_utils.h" |
| #include "phonenumbers/normalize_utf8.h" |
| #include "phonenumbers/phonemetadata.pb.h" |
| #include "phonenumbers/phonenumber.pb.h" |
| #include "phonenumbers/phonenumbermatch.h" |
| #include "phonenumbers/phonenumberutil.h" |
| #include "phonenumbers/regexp_adapter.h" |
| #include "phonenumbers/regexp_adapter_icu.h" |
| #include "phonenumbers/stringutil.h" |
| |
| #ifdef I18N_PHONENUMBERS_USE_RE2 |
| #include "phonenumbers/regexp_adapter_re2.h" |
#endif  // I18N_PHONENUMBERS_USE_RE2
| |
| using std::make_pair; |
| using std::map; |
| using std::numeric_limits; |
| using std::string; |
| using std::vector; |
| |
| namespace i18n { |
| namespace phonenumbers { |
| |
| namespace { |
| // Returns a regular expression quantifier with an upper and lower limit. |
| string Limit(int lower, int upper) { |
| DCHECK_GE(lower, 0); |
| DCHECK_GT(upper, 0); |
| DCHECK_LT(lower, upper); |
| return StrCat("{", lower, ",", upper, "}"); |
| } |
| |
| bool IsInvalidPunctuationSymbol(char32 character) { |
| return character == '%' || u_charType(character) == U_CURRENCY_SYMBOL; |
| } |
| |
| bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate, |
| const PhoneNumberUtil& util) { |
| // The characters 'x' and 'X' can be (1) a carrier code, in which case they |
| // always precede the national significant number or (2) an extension sign, |
| // in which case they always precede the extension number. We assume a |
| // carrier code is more than 1 digit, so the first case has to have more than |
| // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1 |
| // 'x' or 'X'. |
| size_t found; |
| found = candidate.find_first_of("xX"); |
| // We ignore the character if 'x' or 'X' appears as the last character of |
| // the string. |
| while (found != string::npos && found < candidate.length() - 1) { |
| // We only look for 'x' or 'X' in ASCII form. |
| char next_char = candidate[found + 1]; |
| if (next_char == 'x' || next_char == 'X') { |
| // This is the carrier code case, in which the 'X's always precede the |
| // national significant number. |
| ++found; |
| if (util.IsNumberMatchWithOneString( |
| number, candidate.substr(found, candidate.length() - found)) |
| != PhoneNumberUtil::NSN_MATCH) { |
| return false; |
| } |
| } else { |
| string normalized_extension(candidate.substr(found, |
| candidate.length() - found)); |
| util.NormalizeDigitsOnly(&normalized_extension); |
| if (normalized_extension != number.extension()) { |
| return false; |
| } |
| } |
| found = candidate.find_first_of("xX", found + 1); |
| } |
| return true; |
| } |
| |
| bool AllNumberGroupsRemainGrouped( |
| const PhoneNumberUtil& util, |
| const PhoneNumber& phone_number, |
| const string& normalized_candidate, |
| const vector<string>& formatted_number_groups) { |
| size_t from_index = 0; |
| // Check each group of consecutive digits are not broken into separate |
| // groupings in the normalized_candidate string. |
| for (size_t i = 0; i < formatted_number_groups.size(); ++i) { |
| // Fails if the substring of normalized_candidate starting from from_index |
| // doesn't contain the consecutive digits in formatted_number_groups.at(i). |
| from_index = normalized_candidate.find(formatted_number_groups.at(i), |
| from_index); |
| if (from_index == string::npos) { |
| return false; |
| } |
| // Moves from_index forward. |
| from_index += formatted_number_groups.at(i).length(); |
| if (i == 0 && from_index < normalized_candidate.length()) { |
| // We are at the position right after the NDC. We get the region used for |
| // formatting information based on the country code in the phone number, |
| // rather than the number itself, as we do not need to distinguish between |
| // different countries with the same country calling code and this is |
| // faster. |
| string region; |
| util.GetRegionCodeForCountryCode(phone_number.country_code(), ®ion); |
| string ndd_prefix; |
| util.GetNddPrefixForRegion(region, true, &ndd_prefix); |
| // Note although normalized_candidate might contain non-ASCII formatting |
| // characters, they won't be treated as ASCII digits when converted to a |
| // char. |
| if (!ndd_prefix.empty() && isdigit(normalized_candidate.at(from_index))) { |
| // This means there is no formatting symbol after the NDC. In this case, |
| // we only accept the number if there is no formatting symbol at all in |
| // the number, except for extensions. This is only important for |
| // countries with national prefixes. |
| string national_significant_number; |
| util.GetNationalSignificantNumber( |
| phone_number, &national_significant_number); |
| return HasPrefixString(normalized_candidate.substr( |
| from_index - formatted_number_groups.at(i).length()), |
| national_significant_number); |
| } |
| } |
| } |
| // The check here makes sure that we haven't mistakenly already used the |
| // extension to match the last group of the subscriber number. Note the |
| // extension cannot have formatting in-between digits. |
| return normalized_candidate.substr(from_index) |
| .find(phone_number.extension()) != string::npos; |
| } |
| |
| bool LoadAlternateFormats(PhoneMetadataCollection* alternate_formats) { |
| #if defined(I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS) |
| if (!alternate_formats->ParseFromArray(alternate_format_get(), |
| alternate_format_size())) { |
| LOG(ERROR) << "Could not parse binary data."; |
| return false; |
| } |
| return true; |
| #else |
| return false; |
| #endif |
| } |
| |
| } // namespace |
| |
// Singleton holding the pattern fragments, regexp factories and compiled
// regular expressions used by PhoneNumberMatcher.
//
// The constructor's initializer list builds each member from earlier ones, so
// the declaration order below matters: helper strings first, then the
// factories, then the compiled expressions.
class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
 private:
  friend class Singleton<PhoneNumberMatcherRegExps>;

  // Characters treated as opening / closing brackets, already escaped for use
  // inside a character class, and a class matching anything that is neither.
  string opening_parens_;
  string closing_parens_;
  string non_parens_;
  // Limit on the number of pairs of brackets in a phone number.
  string bracket_pair_limit_;
  // Helper strings for the matching_brackets_ pattern.
  // An opening bracket at the beginning may not be closed, but subsequent ones
  // should be. It's also possible that the leading bracket was dropped, so we
  // shouldn't be surprised if we see a closing bracket first.
  string leading_maybe_matched_bracket_;
  string bracket_pairs_;
  // Limit on the number of leading (plus) characters.
  string lead_limit_;
  // Limit on the number of consecutive punctuation characters.
  string punctuation_limit_;
  // The maximum number of digits allowed in a digit-separated block. As we
  // allow all digits in a single block, this should be set high enough to
  // accommodate the entire national number and the international country code.
  int digit_block_limit_;
  // Limit on the number of blocks separated by punctuation. Uses
  // digit_block_limit_ since some formats use spaces to separate each digit.
  string block_limit_;
  // A punctuation sequence allowing white space.
  string punctuation_;
  // A digits block without punctuation.
  string digit_sequence_;
  // Punctuation that may be at the start of a phone number - brackets and plus
  // signs.
  string lead_class_chars_;
  // Same as lead_class_chars_, but enclosed as a character class.
  string lead_class_;
  // Extra helper strings that form part of pattern_. These are stored
  // separately since StrCat has a limit of 12 args.
  string opening_punctuation_;
  string optional_extn_pattern_;

 public:
  // We use two different reg-ex factories here for performance reasons. RE2 is
  // much faster for smaller reg-ex patterns, but the main pattern cannot be
  // handled by RE2 in an efficient way.
  // regexp_factory_for_pattern_ is always ICU-based and compiles pattern_;
  // regexp_factory_ compiles every other expression and is RE2-based only when
  // I18N_PHONENUMBERS_USE_RE2 is defined.
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
  scoped_ptr<const AbstractRegExpFactory> regexp_factory_;

  // Matches strings that look like publication pages. Example:
  // Computing Complete Answers to Queries in the Presence of Limited Access
  // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
  //
  // The string "211-227 (2003)" is not a telephone number.
  scoped_ptr<const RegExp> pub_pages_;
  // Matches strings that look like dates using "/" as a separator. Examples:
  // 3/10/2011, 31/10/96 or 08/31/95.
  scoped_ptr<const RegExp> slash_separated_dates_;
  // Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
  // not include trailing ":\d\d" -- that is covered by time_stamps_suffix_.
  scoped_ptr<const RegExp> time_stamps_;
  scoped_ptr<const RegExp> time_stamps_suffix_;
  // Pattern to check that brackets match. Opening brackets should be closed
  // within a phone number. This also checks that there is something inside the
  // brackets. Having no brackets at all is also fine.
  scoped_ptr<const RegExp> matching_brackets_;
  // Matches white-space, which may indicate the end of a phone number and the
  // start of something else (such as a neighbouring zip-code). If white-space
  // is found, continues to match all characters that are not typically used to
  // start a phone number.
  scoped_ptr<const RegExp> group_separator_;
  // Compiled from PhoneNumberUtil::kCaptureUpToSecondNumberStart; Find() uses
  // it to cut a candidate short when extra numbers follow.
  scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
  // Captures one run of digits ("(\d+)").
  scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
  // Compiled reg-ex representing lead_class_.
  scoped_ptr<const RegExp> lead_class_pattern_;
  // Phone number pattern allowing optional punctuation.
  scoped_ptr<const RegExp> pattern_;

  // Builds all helper strings and compiles every expression. The bracket
  // strings contain the ASCII brackets plus their fullwidth forms, written as
  // UTF-8 byte escapes.
  PhoneNumberMatcherRegExps()
      : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[" + U+FF08, U+FF3B */),
        closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\]" + U+FF09, U+FF3D */),
        non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")),
        bracket_pair_limit_(Limit(0, 3)),
        leading_maybe_matched_bracket_(StrCat(
            "(?:[", opening_parens_, "])?",
            "(?:", non_parens_, "+[", closing_parens_, "])?")),
        bracket_pairs_(StrCat(
            "(?:[", opening_parens_, "]", non_parens_, "+",
            "[", closing_parens_, "])", bracket_pair_limit_)),
        lead_limit_(Limit(0, 2)),
        punctuation_limit_(Limit(0, 4)),
        digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn +
                           PhoneNumberUtil::kMaxLengthCountryCode),
        block_limit_(Limit(0, digit_block_limit_)),
        punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]",
                            punctuation_limit_)),
        digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))),
        lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)),
        lead_class_(StrCat("[", lead_class_chars_, "]")),
        opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")),
        optional_extn_pattern_(StrCat(
            "(?i)(?:",
            PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
            ")?")),
        regexp_factory_for_pattern_(new ICURegExpFactory()),
#ifdef I18N_PHONENUMBERS_USE_RE2
        regexp_factory_(new RE2RegExpFactory()),
#else
        regexp_factory_(new ICURegExpFactory()),
#endif  // I18N_PHONENUMBERS_USE_RE2
        pub_pages_(regexp_factory_->CreateRegExp(
            "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
        slash_separated_dates_(regexp_factory_->CreateRegExp(
            "(?:(?:[0-3]?\\d/[01]?\\d)|"
            "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")),
        time_stamps_(regexp_factory_->CreateRegExp(
            "[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d [0-2]\\d$")),
        time_stamps_suffix_(regexp_factory_->CreateRegExp(":[0-5]\\d")),
        matching_brackets_(regexp_factory_->CreateRegExp(
            StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
                   bracket_pairs_, non_parens_, "*"))),
        group_separator_(regexp_factory_->CreateRegExp(
            StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))),
        capture_up_to_second_number_start_pattern_(
            regexp_factory_->CreateRegExp(
                PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
        capturing_ascii_digits_pattern_(
            regexp_factory_->CreateRegExp("(\\d+)")),
        lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
        // Overall shape: optional leading punctuation, a digit block, further
        // punctuation-separated digit blocks, then an optional extension.
        pattern_(regexp_factory_for_pattern_->CreateRegExp(
            StrCat("(", opening_punctuation_, lead_limit_,
                   digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
                   block_limit_, optional_extn_pattern_, ")"))) {
  }

 private:
  DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps);
};
| |
| class AlternateFormats : public Singleton<AlternateFormats> { |
| public: |
| PhoneMetadataCollection format_data_; |
| |
| map<int, const PhoneMetadata*> calling_code_to_alternate_formats_map_; |
| |
| AlternateFormats() |
| : format_data_(), |
| calling_code_to_alternate_formats_map_() { |
| if (!LoadAlternateFormats(&format_data_)) { |
| LOG(DFATAL) << "Could not parse compiled-in metadata."; |
| return; |
| } |
| for (RepeatedPtrField<PhoneMetadata>::const_iterator it = |
| format_data_.metadata().begin(); |
| it != format_data_.metadata().end(); |
| ++it) { |
| calling_code_to_alternate_formats_map_.insert( |
| make_pair(it->country_code(), &*it)); |
| } |
| } |
| |
| const PhoneMetadata* GetAlternateFormatsForCountry(int country_calling_code) |
| const { |
| map<int, const PhoneMetadata*>::const_iterator it = |
| calling_code_to_alternate_formats_map_.find(country_calling_code); |
| if (it != calling_code_to_alternate_formats_map_.end()) { |
| return it->second; |
| } |
| return NULL; |
| } |
| |
| private: |
| DISALLOW_COPY_AND_ASSIGN(AlternateFormats); |
| }; |
| |
// Creates a matcher that searches text for phone numbers.
//
// util        - phone number utility used for parsing and validation.
// text        - the text to search.
// region_code - stored as the preferred region and passed to the parser.
// leniency    - how strictly candidates are verified.
// max_tries   - attempt budget; it is decremented after each failed candidate
//               and the search stops once it is no longer positive.
//
// The matcher starts in the NOT_READY state with no cached match and the
// search position at the beginning of the text.
PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util,
                                       const string& text,
                                       const string& region_code,
                                       PhoneNumberMatcher::Leniency leniency,
                                       int max_tries)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      alternate_formats_(AlternateFormats::GetInstance()),
      phone_util_(util),
      text_(text),
      preferred_region_(region_code),
      leniency_(leniency),
      max_tries_(max_tries),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
| |
// Convenience constructor: searches text using the PhoneNumberUtil singleton,
// VALID leniency and an attempt budget of numeric_limits<int>::max().
//
// text        - the text to search.
// region_code - stored as the preferred region and passed to the parser.
PhoneNumberMatcher::PhoneNumberMatcher(const string& text,
                                       const string& region_code)
    : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()),
      // In this file alternate formats are only consulted by
      // CheckNumberGroupingIsValid(), which VerifyAccordingToLeniency() calls
      // for STRICT_GROUPING and EXACT_GROUPING only, never for VALID.
      alternate_formats_(NULL),  // Not used.
      phone_util_(*PhoneNumberUtil::GetInstance()),
      text_(text),
      preferred_region_(region_code),
      leniency_(VALID),
      max_tries_(numeric_limits<int>::max()),
      state_(NOT_READY),
      last_match_(NULL),
      search_index_(0) {
}
| |
// Empty body: no explicit cleanup is performed here.
PhoneNumberMatcher::~PhoneNumberMatcher() {
}
| |
| // static |
| bool PhoneNumberMatcher::IsLatinLetter(char32 letter) { |
| // Combining marks are a subset of non-spacing-mark. |
| if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) { |
| return false; |
| } |
| UBlockCode block = ublock_getCode(letter); |
| return ((block == UBLOCK_BASIC_LATIN) || |
| (block == UBLOCK_LATIN_1_SUPPLEMENT) || |
| (block == UBLOCK_LATIN_EXTENDED_A) || |
| (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) || |
| (block == UBLOCK_LATIN_EXTENDED_B) || |
| (block == UBLOCK_COMBINING_DIACRITICAL_MARKS)); |
| } |
| |
| bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset, |
| PhoneNumberMatch* match) { |
| DCHECK(match); |
| // Check the candidate doesn't contain any formatting which would indicate |
| // that it really isn't a phone number. |
| if (!reg_exps_->matching_brackets_->FullMatch(candidate)) { |
| return false; |
| } |
| |
| // If leniency is set to VALID or stricter, we also want to skip numbers that |
| // are surrounded by Latin alphabetic characters, to skip cases like |
| // abc8005001234 or 8005001234def. |
| if (leniency_ >= VALID) { |
| // If the candidate is not at the start of the text, and does not start with |
| // phone-number punctuation, check the previous character. |
| scoped_ptr<RegExpInput> candidate_input( |
| reg_exps_->regexp_factory_->CreateInput(candidate)); |
| if (offset > 0 && |
| !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) { |
| char32 previous_char; |
| const char* previous_char_ptr = |
| EncodingUtils::BackUpOneUTF8Character(text_.c_str(), |
| text_.c_str() + offset); |
| EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char); |
| // We return false if it is a latin letter or an invalid punctuation |
| // symbol. |
| if (IsInvalidPunctuationSymbol(previous_char) || |
| IsLatinLetter(previous_char)) { |
| return false; |
| } |
| } |
| size_t lastCharIndex = offset + candidate.length(); |
| if (lastCharIndex < text_.length()) { |
| char32 next_char; |
| const char* next_char_ptr = |
| EncodingUtils::AdvanceOneUTF8Character( |
| text_.c_str() + lastCharIndex - 1); |
| EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char); |
| if (IsInvalidPunctuationSymbol(next_char) || IsLatinLetter(next_char)) { |
| return false; |
| } |
| } |
| } |
| |
| PhoneNumber number; |
| if (phone_util_.ParseAndKeepRawInput(candidate, preferred_region_, &number) != |
| PhoneNumberUtil::NO_PARSING_ERROR) { |
| return false; |
| } |
| if (VerifyAccordingToLeniency(leniency_, number, candidate)) { |
| match->set_start(offset); |
| match->set_raw_string(candidate); |
| // We used ParseAndKeepRawInput to create this number, but for now we don't |
| // return the extra values parsed. TODO: stop clearing all values here and |
| // switch all users over to using raw_input() rather than the raw_string() |
| // of PhoneNumberMatch. |
| number.clear_country_code_source(); |
| number.clear_preferred_domestic_carrier_code(); |
| number.clear_raw_input(); |
| match->set_number(number); |
| return true; |
| } |
| return false; |
| } |
| |
| // Helper method to replace the verification method for each enum in the Java |
| // version. |
| bool PhoneNumberMatcher::VerifyAccordingToLeniency( |
| Leniency leniency, const PhoneNumber& number, |
| const string& candidate) const { |
| switch (leniency) { |
| case PhoneNumberMatcher::POSSIBLE: |
| return phone_util_.IsPossibleNumber(number); |
| case PhoneNumberMatcher::VALID: |
| if (!phone_util_.IsValidNumber(number) || |
| !ContainsOnlyValidXChars(number, candidate, phone_util_)) { |
| return false; |
| } |
| return IsNationalPrefixPresentIfRequired(number); |
| case PhoneNumberMatcher::STRICT_GROUPING: { |
| if (!phone_util_.IsValidNumber(number) || |
| !ContainsOnlyValidXChars(number, candidate, phone_util_) || |
| ContainsMoreThanOneSlashInNationalNumber( |
| number, candidate, phone_util_) || |
| !IsNationalPrefixPresentIfRequired(number)) { |
| return false; |
| } |
| ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, |
| const string&, const vector<string>&>* callback = |
| NewPermanentCallback(&AllNumberGroupsRemainGrouped); |
| bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); |
| delete(callback); |
| return is_valid; |
| } |
| case PhoneNumberMatcher::EXACT_GROUPING: { |
| if (!phone_util_.IsValidNumber(number) || |
| !ContainsOnlyValidXChars(number, candidate, phone_util_) || |
| ContainsMoreThanOneSlashInNationalNumber( |
| number, candidate, phone_util_) || |
| !IsNationalPrefixPresentIfRequired(number)) { |
| return false; |
| } |
| ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, |
| const string&, const vector<string>&>* callback = |
| NewPermanentCallback( |
| this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent); |
| bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); |
| delete(callback); |
| return is_valid; |
| } |
| default: |
| LOG(ERROR) << "No implementation defined for verification for leniency " |
| << static_cast<int>(leniency); |
| return false; |
| } |
| } |
| |
| bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset, |
| PhoneNumberMatch* match) { |
| DCHECK(match); |
| // Try removing either the first or last "group" in the number and see if this |
| // gives a result. We consider white space to be a possible indication of |
| // the start or end of the phone number. |
| scoped_ptr<RegExpInput> candidate_input( |
| reg_exps_->regexp_factory_->CreateInput(candidate)); |
| if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), |
| NULL)) { |
| // Try the first group by itself. |
| int group_start_index = |
| candidate.length() - candidate_input->ToString().length(); |
| string first_group_only = candidate.substr(0, group_start_index); |
| phone_util_.TrimUnwantedEndChars(&first_group_only); |
| bool success = ParseAndVerify(first_group_only, offset, match); |
| if (success) { |
| return true; |
| } |
| --max_tries_; |
| |
| // Try the rest of the candidate without the first group. |
| string without_first_group(candidate_input->ToString()); |
| phone_util_.TrimUnwantedEndChars(&without_first_group); |
| success = |
| ParseAndVerify(without_first_group, offset + group_start_index, match); |
| if (success) { |
| return true; |
| } |
| --max_tries_; |
| |
| if (max_tries_ > 0) { |
| while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), |
| NULL)) { |
| // Find the last group. |
| } |
| int last_group_start = |
| candidate.length() - candidate_input->ToString().length(); |
| string without_last_group = candidate.substr(0, last_group_start); |
| phone_util_.TrimUnwantedEndChars(&without_last_group); |
| if (without_last_group == first_group_only) { |
| // If there are only two groups, then the group "without the last group" |
| // is the same as the first group. In these cases, we don't want to |
| // re-check the number group, so we exit already. |
| return false; |
| } |
| success = ParseAndVerify(without_last_group, offset, match); |
| if (success) { |
| return true; |
| } |
| --max_tries_; |
| } |
| } |
| return false; |
| } |
| |
| bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset, |
| PhoneNumberMatch* match) { |
| DCHECK(match); |
| // Skip a match that is more likely a publication page reference or a date. |
| if (reg_exps_->pub_pages_->PartialMatch(candidate) || |
| reg_exps_->slash_separated_dates_->PartialMatch(candidate)) { |
| return false; |
| } |
| // Skip potential time-stamps. |
| if (reg_exps_->time_stamps_->PartialMatch(candidate)) { |
| scoped_ptr<RegExpInput> following_text( |
| reg_exps_->regexp_factory_->CreateInput( |
| text_.substr(offset + candidate.size()))); |
| if (reg_exps_->time_stamps_suffix_->Consume(following_text.get())) { |
| return false; |
| } |
| } |
| |
| // Try to come up with a valid match given the entire candidate. |
| if (ParseAndVerify(candidate, offset, match)) { |
| return true; |
| } |
| |
| // If that failed, try to find an "inner match" - there might be a phone |
| // number within this candidate. |
| return ExtractInnerMatch(candidate, offset, match); |
| } |
| |
| bool PhoneNumberMatcher::HasNext() { |
| if (state_ == NOT_READY) { |
| PhoneNumberMatch temp_match; |
| if (!Find(search_index_, &temp_match)) { |
| state_ = DONE; |
| } else { |
| last_match_.reset(new PhoneNumberMatch(temp_match.start(), |
| temp_match.raw_string(), |
| temp_match.number())); |
| search_index_ = last_match_->end(); |
| state_ = READY; |
| } |
| } |
| return state_ == READY; |
| } |
| |
| bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) { |
| DCHECK(match); |
| // Check the state and find the next match as a side-effect if necessary. |
| if (!HasNext()) { |
| return false; |
| } |
| match->CopyFrom(*last_match_); |
| state_ = NOT_READY; |
| last_match_.reset(NULL); |
| return true; |
| } |
| |
| bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) { |
| DCHECK(match); |
| |
| scoped_ptr<RegExpInput> text( |
| reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index))); |
| string candidate; |
| while ((max_tries_ > 0) && |
| reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) { |
| int start = text_.length() - text->ToString().length() - candidate.length(); |
| // Check for extra numbers at the end. |
| reg_exps_->capture_up_to_second_number_start_pattern_-> |
| PartialMatch(candidate, &candidate); |
| if (ExtractMatch(candidate, start, match)) { |
| return true; |
| } |
| |
| index = start + candidate.length(); |
| --max_tries_; |
| } |
| return false; |
| } |
| |
| bool PhoneNumberMatcher::CheckNumberGroupingIsValid( |
| const PhoneNumber& phone_number, |
| const string& candidate, |
| ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&, |
| const string&, const vector<string>&>* checker) const { |
| DCHECK(checker); |
| // TODO: Evaluate how this works for other locales (testing has been limited |
| // to NANPA regions) and optimise if necessary. |
| string normalized_candidate = |
| NormalizeUTF8::NormalizeDecimalDigits(candidate); |
| vector<string> formatted_number_groups; |
| GetNationalNumberGroups(phone_number, NULL, // Use default formatting pattern |
| &formatted_number_groups); |
| if (checker->Run(phone_util_, phone_number, normalized_candidate, |
| formatted_number_groups)) { |
| return true; |
| } |
| // If this didn't pass, see if there are any alternate formats, and try them |
| // instead. |
| const PhoneMetadata* alternate_formats = |
| alternate_formats_->GetAlternateFormatsForCountry( |
| phone_number.country_code()); |
| if (alternate_formats) { |
| for (RepeatedPtrField<NumberFormat>::const_iterator it = |
| alternate_formats->number_format().begin(); |
| it != alternate_formats->number_format().end(); ++it) { |
| formatted_number_groups.clear(); |
| GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups); |
| if (checker->Run(phone_util_, phone_number, normalized_candidate, |
| formatted_number_groups)) { |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| // Helper method to get the national-number part of a number, formatted without |
| // any national prefix, and return it as a set of digit blocks that would be |
| // formatted together. |
| void PhoneNumberMatcher::GetNationalNumberGroups( |
| const PhoneNumber& number, |
| const NumberFormat* formatting_pattern, |
| vector<string>* digit_blocks) const { |
| string rfc3966_format; |
| if (!formatting_pattern) { |
| // This will be in the format +CC-DG;ext=EXT where DG represents groups of |
| // digits. |
| phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format); |
| // We remove the extension part from the formatted string before splitting |
| // it into different groups. |
| size_t end_index = rfc3966_format.find(';'); |
| if (end_index == string::npos) { |
| end_index = rfc3966_format.length(); |
| } |
| // The country-code will have a '-' following it. |
| size_t start_index = rfc3966_format.find('-') + 1; |
| SplitStringUsing(rfc3966_format.substr(start_index, |
| end_index - start_index), |
| "-", digit_blocks); |
| } else { |
| // We format the NSN only, and split that according to the separator. |
| string national_significant_number; |
| phone_util_.GetNationalSignificantNumber(number, |
| &national_significant_number); |
| phone_util_.FormatNsnUsingPattern(national_significant_number, |
| *formatting_pattern, |
| PhoneNumberUtil::RFC3966, |
| &rfc3966_format); |
| SplitStringUsing(rfc3966_format, "-", digit_blocks); |
| } |
| } |
| |
| bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired( |
| const PhoneNumber& number) const { |
| // First, check how we deduced the country code. If it was written in |
| // international format, then the national prefix is not required. |
| if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) { |
| return true; |
| } |
| string phone_number_region; |
| phone_util_.GetRegionCodeForCountryCode( |
| number.country_code(), &phone_number_region); |
| const PhoneMetadata* metadata = |
| phone_util_.GetMetadataForRegion(phone_number_region); |
| if (!metadata) { |
| return true; |
| } |
| // Check if a national prefix should be present when formatting this number. |
| string national_number; |
| phone_util_.GetNationalSignificantNumber(number, &national_number); |
| const NumberFormat* format_rule = |
| phone_util_.ChooseFormattingPatternForNumber(metadata->number_format(), |
| national_number); |
| // To do this, we check that a national prefix formatting rule was present and |
| // that it wasn't just the first-group symbol ($1) with punctuation. |
| if (format_rule && !format_rule->national_prefix_formatting_rule().empty()) { |
| if (format_rule->national_prefix_optional_when_formatting()) { |
| // The national-prefix is optional in these cases, so we don't need to |
| // check if it was present. |
| return true; |
| } |
| if (phone_util_.FormattingRuleHasFirstGroupOnly( |
| format_rule->national_prefix_formatting_rule())) { |
| // National Prefix not needed for this number. |
| return true; |
| } |
| // Normalize the remainder. |
| string raw_input_copy(number.raw_input()); |
| // Check if we found a national prefix and/or carrier code at the start of |
| // the raw input, and return the result. |
| phone_util_.NormalizeDigitsOnly(&raw_input_copy); |
| return phone_util_.MaybeStripNationalPrefixAndCarrierCode( |
| *metadata, |
| &raw_input_copy, |
| NULL); // Don't need to keep the stripped carrier code. |
| } |
| return true; |
| } |
| |
| bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent( |
| const PhoneNumberUtil& util, |
| const PhoneNumber& phone_number, |
| const string& normalized_candidate, |
| const vector<string>& formatted_number_groups) const { |
| const scoped_ptr<RegExpInput> candidate_number( |
| reg_exps_->regexp_factory_->CreateInput(normalized_candidate)); |
| vector<string> candidate_groups; |
| string digit_block; |
| while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume( |
| candidate_number.get(), |
| &digit_block)) { |
| candidate_groups.push_back(digit_block); |
| } |
| |
| // Set this to the last group, skipping it if the number has an extension. |
| int candidate_number_group_index = |
| phone_number.has_extension() ? candidate_groups.size() - 2 |
| : candidate_groups.size() - 1; |
| // First we check if the national significant number is formatted as a block. |
| // We use find and not equals, since the national significant number may be |
| // present with a prefix such as a national number prefix, or the country code |
| // itself. |
| string national_significant_number; |
| util.GetNationalSignificantNumber(phone_number, |
| &national_significant_number); |
| if (candidate_groups.size() == 1 || |
| candidate_groups.at(candidate_number_group_index).find( |
| national_significant_number) != string::npos) { |
| return true; |
| } |
| // Starting from the end, go through in reverse, excluding the first group, |
| // and check the candidate and number groups are the same. |
| for (int formatted_number_group_index = |
| (formatted_number_groups.size() - 1); |
| formatted_number_group_index > 0 && |
| candidate_number_group_index >= 0; |
| --formatted_number_group_index, --candidate_number_group_index) { |
| if (candidate_groups.at(candidate_number_group_index) != |
| formatted_number_groups.at(formatted_number_group_index)) { |
| return false; |
| } |
| } |
| // Now check the first group. There may be a national prefix at the start, so |
| // we only check that the candidate group ends with the formatted number |
| // group. |
| return (candidate_number_group_index >= 0 && |
| HasSuffixString(candidate_groups.at(candidate_number_group_index), |
| formatted_number_groups.at(0))); |
| } |
| |
| // static |
| bool PhoneNumberMatcher::ContainsMoreThanOneSlashInNationalNumber( |
| const PhoneNumber& number, |
| const string& candidate, |
| const PhoneNumberUtil& util) { |
| size_t first_slash_in_body = candidate.find('/'); |
| if (first_slash_in_body == string::npos) { |
| // No slashes, this is okay. |
| return false; |
| } |
| // Now look for a second one. |
| size_t second_slash_in_body = candidate.find('/', first_slash_in_body + 1); |
| if (second_slash_in_body == string::npos) { |
| // Only one slash, this is okay. |
| return false; |
| } |
| |
| // If the first slash is after the country calling code, this is permitted. |
| if (number.country_code_source() == PhoneNumber::FROM_NUMBER_WITH_PLUS_SIGN || |
| number.country_code_source() == |
| PhoneNumber::FROM_NUMBER_WITHOUT_PLUS_SIGN) { |
| string normalized_country_code = |
| candidate.substr(0, first_slash_in_body); |
| util.NormalizeDigitsOnly(&normalized_country_code); |
| if (normalized_country_code == SimpleItoa(number.country_code())) { |
| // Any more slashes and this is illegal. |
| return candidate.find('/', second_slash_in_body + 1) != string::npos; |
| } |
| } |
| return true; |
| } |
| |
| } // namespace phonenumbers |
| } // namespace i18n |