| // Copyright 2013 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "components/url_matcher/substring_set_matcher.h" |
| |
| #include <algorithm> |
| #include <queue> |
| |
| #include "base/logging.h" |
| #include "base/stl_util.h" |
| |
| namespace url_matcher { |
| |
| namespace { |
| |
| // Compare StringPattern instances based on their string patterns. |
| bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
| return a->pattern() < b->pattern(); |
| } |
| |
| // Given the set of patterns, compute how many nodes will the corresponding |
| // Aho-Corasick tree have. Note that |patterns| need to be sorted. |
| uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { |
| uint32 result = 1u; // 1 for the root node. |
| if (patterns.empty()) |
| return result; |
| |
| std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
| std::vector<const StringPattern*>::const_iterator current = last + 1; |
| // For the first pattern, each letter is a label of an edge to a new node. |
| result += (*last)->pattern().size(); |
| |
| // For the subsequent patterns, only count the edges which were not counted |
| // yet. For this it suffices to test against the previous pattern, because the |
| // patterns are sorted. |
| for (; current != patterns.end(); ++last, ++current) { |
| const std::string& last_pattern = (*last)->pattern(); |
| const std::string& current_pattern = (*current)->pattern(); |
| const uint32 prefix_bound = |
| std::min(last_pattern.size(), current_pattern.size()); |
| |
| uint32 common_prefix = 0; |
| while (common_prefix < prefix_bound && |
| last_pattern[common_prefix] == current_pattern[common_prefix]) |
| ++common_prefix; |
| result += current_pattern.size() - common_prefix; |
| } |
| return result; |
| } |
| |
| } // namespace |
| |
| // |
| // SubstringSetMatcher |
| // |
| |
| SubstringSetMatcher::SubstringSetMatcher() { |
| RebuildAhoCorasickTree(SubstringPatternVector()); |
| } |
| |
| SubstringSetMatcher::~SubstringSetMatcher() {} |
| |
| void SubstringSetMatcher::RegisterPatterns( |
| const std::vector<const StringPattern*>& patterns) { |
| RegisterAndUnregisterPatterns(patterns, |
| std::vector<const StringPattern*>()); |
| } |
| |
| void SubstringSetMatcher::UnregisterPatterns( |
| const std::vector<const StringPattern*>& patterns) { |
| RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(), |
| patterns); |
| } |
| |
| void SubstringSetMatcher::RegisterAndUnregisterPatterns( |
| const std::vector<const StringPattern*>& to_register, |
| const std::vector<const StringPattern*>& to_unregister) { |
| // Register patterns. |
| for (std::vector<const StringPattern*>::const_iterator i = |
| to_register.begin(); i != to_register.end(); ++i) { |
| DCHECK(patterns_.find((*i)->id()) == patterns_.end()); |
| patterns_[(*i)->id()] = *i; |
| } |
| |
| // Unregister patterns |
| for (std::vector<const StringPattern*>::const_iterator i = |
| to_unregister.begin(); i != to_unregister.end(); ++i) { |
| patterns_.erase((*i)->id()); |
| } |
| |
| // Now we compute the total number of tree nodes needed. |
| SubstringPatternVector sorted_patterns; |
| sorted_patterns.resize(patterns_.size()); |
| |
| size_t next = 0; |
| for (SubstringPatternMap::const_iterator i = patterns_.begin(); |
| i != patterns_.end(); |
| ++i, ++next) { |
| sorted_patterns[next] = i->second; |
| } |
| |
| std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); |
| tree_.reserve(TreeSize(sorted_patterns)); |
| |
| RebuildAhoCorasickTree(sorted_patterns); |
| } |
| |
| bool SubstringSetMatcher::Match(const std::string& text, |
| std::set<StringPattern::ID>* matches) const { |
| const size_t old_number_of_matches = matches->size(); |
| |
| // Handle patterns matching the empty string. |
| matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
| |
| uint32 current_node = 0; |
| for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
| uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
| while (edge_from_current == AhoCorasickNode::kNoSuchEdge && |
| current_node != 0) { |
| current_node = tree_[current_node].failure(); |
| edge_from_current = tree_[current_node].GetEdge(*i); |
| } |
| if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
| current_node = edge_from_current; |
| matches->insert(tree_[current_node].matches().begin(), |
| tree_[current_node].matches().end()); |
| } else { |
| DCHECK_EQ(0u, current_node); |
| } |
| } |
| |
| return old_number_of_matches != matches->size(); |
| } |
| |
| bool SubstringSetMatcher::IsEmpty() const { |
| // An empty tree consists of only the root node. |
| return patterns_.empty() && tree_.size() == 1u; |
| } |
| |
| void SubstringSetMatcher::RebuildAhoCorasickTree( |
| const SubstringPatternVector& sorted_patterns) { |
| tree_.clear(); |
| |
| // Initialize root note of tree. |
| AhoCorasickNode root; |
| root.set_failure(0); |
| tree_.push_back(root); |
| |
| // Insert all patterns. |
| for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); |
| i != sorted_patterns.end(); |
| ++i) { |
| InsertPatternIntoAhoCorasickTree(*i); |
| } |
| |
| CreateFailureEdges(); |
| } |
| |
| void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| const StringPattern* pattern) { |
| const std::string& text = pattern->pattern(); |
| const std::string::const_iterator text_end = text.end(); |
| |
| // Iterators on the tree and the text. |
| uint32 current_node = 0; |
| std::string::const_iterator i = text.begin(); |
| |
| // Follow existing paths for as long as possible. |
| while (i != text_end) { |
| uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
| if (edge_from_current == AhoCorasickNode::kNoSuchEdge) |
| break; |
| current_node = edge_from_current; |
| ++i; |
| } |
| |
| // Create new nodes if necessary. |
| while (i != text_end) { |
| tree_.push_back(AhoCorasickNode()); |
| tree_[current_node].SetEdge(*i, tree_.size() - 1); |
| current_node = tree_.size() - 1; |
| ++i; |
| } |
| |
| // Register match. |
| tree_[current_node].AddMatch(pattern->id()); |
| } |
| |
| void SubstringSetMatcher::CreateFailureEdges() { |
| typedef AhoCorasickNode::Edges Edges; |
| |
| std::queue<uint32> queue; |
| |
| AhoCorasickNode& root = tree_[0]; |
| root.set_failure(0); |
| const Edges& root_edges = root.edges(); |
| for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
| ++e) { |
| const uint32& leads_to = e->second; |
| tree_[leads_to].set_failure(0); |
| queue.push(leads_to); |
| } |
| |
| while (!queue.empty()) { |
| AhoCorasickNode& current_node = tree_[queue.front()]; |
| queue.pop(); |
| for (Edges::const_iterator e = current_node.edges().begin(); |
| e != current_node.edges().end(); ++e) { |
| const char& edge_label = e->first; |
| const uint32& leads_to = e->second; |
| queue.push(leads_to); |
| |
| uint32 failure = current_node.failure(); |
| uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); |
| while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && |
| failure != 0) { |
| failure = tree_[failure].failure(); |
| edge_from_failure = tree_[failure].GetEdge(edge_label); |
| } |
| |
| const uint32 follow_in_case_of_failure = |
| edge_from_failure != AhoCorasickNode::kNoSuchEdge |
| ? edge_from_failure |
| : 0; |
| tree_[leads_to].set_failure(follow_in_case_of_failure); |
| tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
| } |
| } |
| } |
| |
| const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0; |
| |
| SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
| : failure_(kNoSuchEdge) {} |
| |
| SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
| |
| SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
| const SubstringSetMatcher::AhoCorasickNode& other) |
| : edges_(other.edges_), |
| failure_(other.failure_), |
| matches_(other.matches_) {} |
| |
| SubstringSetMatcher::AhoCorasickNode& |
| SubstringSetMatcher::AhoCorasickNode::operator=( |
| const SubstringSetMatcher::AhoCorasickNode& other) { |
| edges_ = other.edges_; |
| failure_ = other.failure_; |
| matches_ = other.matches_; |
| return *this; |
| } |
| |
| uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
| Edges::const_iterator i = edges_.find(c); |
| return i == edges_.end() ? kNoSuchEdge : i->second; |
| } |
| |
| void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { |
| edges_[c] = node; |
| } |
| |
| void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
| matches_.insert(id); |
| } |
| |
| void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
| const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
| matches_.insert(matches.begin(), matches.end()); |
| } |
| |
| } // namespace url_matcher |