blob: d8bc37a3ca032d310ffe3c3b65063004ed02df2b [file] [log] [blame]
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "util/flags.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
#include "re2/re2.h"
DEFINE_int32(filtered_re2_min_atom_len,
3,
"Strings less than this length are not stored as atoms");
namespace re2 {
PrefilterTree::PrefilterTree()
: compiled_(false) {
}
PrefilterTree::~PrefilterTree() {
for (int i = 0; i < prefilter_vec_.size(); i++)
delete prefilter_vec_[i];
for (int i = 0; i < entries_.size(); i++)
delete entries_[i].parents;
}
// Functions used for adding and Compiling prefilters to the
// PrefilterTree.
static bool KeepPart(Prefilter* prefilter, int level) {
if (prefilter == NULL)
return false;
switch (prefilter->op()) {
default:
LOG(DFATAL) << "Unexpected op in KeepPart: "
<< prefilter->op();
return false;
case Prefilter::ALL:
return false;
case Prefilter::ATOM:
return prefilter->atom().size() >=
FLAGS_filtered_re2_min_atom_len;
case Prefilter::AND: {
int j = 0;
vector<Prefilter*>* subs = prefilter->subs();
for (int i = 0; i < subs->size(); i++)
if (KeepPart((*subs)[i], level + 1))
(*subs)[j++] = (*subs)[i];
else
delete (*subs)[i];
subs->resize(j);
return j > 0;
}
case Prefilter::OR:
for (int i = 0; i < prefilter->subs()->size(); i++)
if (!KeepPart((*prefilter->subs())[i], level + 1))
return false;
return true;
}
}
void PrefilterTree::Add(Prefilter *f) {
if (compiled_) {
LOG(DFATAL) << "Add after Compile.";
return;
}
if (f != NULL && !KeepPart(f, 0)) {
delete f;
f = NULL;
}
prefilter_vec_.push_back(f);
}
void PrefilterTree::Compile(vector<string>* atom_vec) {
if (compiled_) {
LOG(DFATAL) << "Compile after Compile.";
return;
}
// We do this check to support some legacy uses of
// PrefilterTree that call Compile before adding any regexps,
// and expect Compile not to have effect.
if (prefilter_vec_.empty())
return;
compiled_ = true;
AssignUniqueIds(atom_vec);
// Identify nodes that are too common among prefilters and are
// triggering too many parents. Then get rid of them if possible.
// Note that getting rid of a prefilter node simply means they are
// no longer necessary for their parent to trigger; that is, we do
// not miss out on any regexps triggering by getting rid of a
// prefilter node.
for (int i = 0; i < entries_.size(); i++) {
IntMap* parents = entries_[i].parents;
if (parents->size() > 8) {
// This one triggers too many things. If all the parents are AND
// nodes and have other things guarding them, then get rid of
// this trigger. TODO(vsri): Adjust the threshold appropriately,
// make it a function of total number of nodes?
bool have_other_guard = true;
for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
have_other_guard = have_other_guard &&
(entries_[it->index()].propagate_up_at_count > 1);
if (have_other_guard) {
for (IntMap::iterator it = parents->begin();
it != parents->end(); ++it)
entries_[it->index()].propagate_up_at_count -= 1;
parents->clear(); // Forget the parents
}
}
}
PrintDebugInfo();
}
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
string node_string = NodeString(node);
map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
if (iter == node_map_.end())
return NULL;
return (*iter).second;
}
static string Itoa(int n) {
char buf[100];
snprintf(buf, sizeof buf, "%d", n);
return string(buf);
}
string PrefilterTree::NodeString(Prefilter* node) const {
// Adding the operation disambiguates AND/OR/atom nodes.
string s = Itoa(node->op()) + ":";
if (node->op() == Prefilter::ATOM) {
s += node->atom();
} else {
for (int i = 0; i < node->subs()->size() ; i++) {
if (i > 0)
s += ',';
s += Itoa((*node->subs())[i]->unique_id());
}
}
return s;
}
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
atom_vec->clear();
// Build vector of all filter nodes, sorted topologically
// from top to bottom in v.
vector<Prefilter*> v;
// Add the top level nodes of each regexp prefilter.
for (int i = 0; i < prefilter_vec_.size(); i++) {
Prefilter* f = prefilter_vec_[i];
if (f == NULL)
unfiltered_.push_back(i);
// We push NULL also on to v, so that we maintain the
// mapping of index==regexpid for level=0 prefilter nodes.
v.push_back(f);
}
// Now add all the descendant nodes.
for (int i = 0; i < v.size(); i++) {
Prefilter* f = v[i];
if (f == NULL)
continue;
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const vector<Prefilter*>& subs = *f->subs();
for (int j = 0; j < subs.size(); j++)
v.push_back(subs[j]);
}
}
// Identify unique nodes.
int unique_id = 0;
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter *node = v[i];
if (node == NULL)
continue;
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// will find this node as the canonical node.
node_map_[NodeString(node)] = node;
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
}
node->set_unique_id(unique_id++);
} else {
node->set_unique_id(canonical->unique_id());
}
}
entries_.resize(node_map_.size());
// Create parent IntMap for the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
entry->parents = new IntMap(node_map_.size());
}
// Fill the entries.
for (int i = v.size() - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
switch (prefilter->op()) {
default:
case Prefilter::ALL:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
return;
case Prefilter::ATOM:
entry->propagate_up_at_count = 1;
break;
case Prefilter::OR:
case Prefilter::AND: {
IntMap uniq_child(node_map_.size());
for (int j = 0; j < prefilter->subs()->size() ; j++) {
Prefilter* child = (*prefilter->subs())[j];
Prefilter* canonical = CanonicalNode(child);
if (canonical == NULL) {
LOG(DFATAL) << "Null canonical node";
return;
}
int child_id = canonical->unique_id();
if (!uniq_child.has_index(child_id))
uniq_child.set_new(child_id, 1);
// To the child, we want to add to parent indices.
Entry* child_entry = &entries_[child_id];
if (!child_entry->parents->has_index(prefilter->unique_id()))
child_entry->parents->set_new(prefilter->unique_id(), 1);
}
entry->propagate_up_at_count =
prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
break;
}
}
}
// For top level nodes, populate regexp id.
for (int i = 0; i < prefilter_vec_.size(); i++) {
if (prefilter_vec_[i] == NULL)
continue;
int id = CanonicalNode(prefilter_vec_[i])->unique_id();
DCHECK_LE(0, id);
Entry* entry = &entries_[id];
entry->regexps.push_back(i);
}
}
// Functions for triggering during search.
void PrefilterTree::RegexpsGivenStrings(
const vector<int>& matched_atoms,
vector<int>* regexps) const {
regexps->clear();
if (!compiled_) {
LOG(WARNING) << "Compile() not called";
for (int i = 0; i < prefilter_vec_.size(); ++i)
regexps->push_back(i);
} else {
if (!prefilter_vec_.empty()) {
IntMap regexps_map(prefilter_vec_.size());
vector<int> matched_atom_ids;
for (int j = 0; j < matched_atoms.size(); j++) {
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
}
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
}
}
sort(regexps->begin(), regexps->end());
}
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
IntMap* regexps) const {
IntMap count(entries_.size());
IntMap work(entries_.size());
for (int i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
VLOG(10) << "Processing: " << it->index();
// Record regexps triggered.
for (int i = 0; i < entry.regexps.size(); i++) {
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
regexps->set(entry.regexps[i], 1);
}
int c;
// Pass trigger up to parents.
for (IntMap::iterator it = entry.parents->begin();
it != entry.parents->end();
++it) {
int j = it->index();
const Entry& parent = entries_[j];
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
if (count.has_index(j)) {
c = count.get_existing(j) + 1;
count.set_existing(j, c);
} else {
c = 1;
count.set_new(j, c);
}
if (c < parent.propagate_up_at_count)
continue;
}
VLOG(10) << "Triggering: " << j;
// Trigger the parent.
work.set(j, 1);
}
}
}
// Debugging help.
void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
}
void PrefilterTree::PrintDebugInfo() {
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
VLOG(10) << "#Unique Nodes: " << entries_.size();
for (int i = 0; i < entries_.size(); ++i) {
IntMap* parents = entries_[i].parents;
const vector<int>& regexps = entries_[i].regexps;
VLOG(10) << "EntryId: " << i
<< " N: " << parents->size() << " R: " << regexps.size();
for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
VLOG(10) << it->index();
}
VLOG(10) << "Map:";
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
iter != node_map_.end(); ++iter)
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;
}
string PrefilterTree::DebugNodeString(Prefilter* node) const {
string node_string = "";
if (node->op() == Prefilter::ATOM) {
DCHECK(!node->atom().empty());
node_string += node->atom();
} else {
// Adding the operation disambiguates AND and OR nodes.
node_string += node->op() == Prefilter::AND ? "AND" : "OR";
node_string += "(";
for (int i = 0; i < node->subs()->size() ; i++) {
if (i > 0)
node_string += ',';
node_string += Itoa((*node->subs())[i]->unique_id());
node_string += ":";
node_string += DebugNodeString((*node->subs())[i]);
}
node_string += ")";
}
return node_string;
}
} // namespace re2