// blob: cef13ca7df8a370168fa9b1c2c15c90d523e42c4 [file] [log] [blame]
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// PhishingTermFeatureExtractor handles computing term features from the text
// of a web page for the client-side phishing detection model. To do this, it
// takes a list of terms that appear in the model, and scans through the page
// text looking for them. Any terms that appear will cause a corresponding
// features::kPageTerm feature to be added to the FeatureMap.
//
// To make it harder for a phisher to enumerate all of the relevant terms in
// the model, the terms are provided as SHA-256 hashes, rather than plain text.
//
// There is one PhishingTermFeatureExtractor per RenderView.
#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
#include <string>
#include "base/basictypes.h"
#include "base/callback.h"
#include "base/containers/hash_tables.h"
#include "base/containers/mru_cache.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/weak_ptr.h"
#include "base/strings/string16.h"
#include "base/strings/string_piece.h"
namespace safe_browsing {
// Forward declarations. This header only refers to these types through
// pointers (constructor/ExtractFeatures() parameters and the clock_ and
// features_ members), so their full definitions are not required here.
class FeatureExtractorClock;
class FeatureMap;
// Extracts page-term features from the text of a page into a FeatureMap.
// The terms to look for are supplied as hashes at construction time; the
// extraction itself is started with ExtractFeatures() and may complete
// asynchronously, reporting its result through a DoneCallback.
class PhishingTermFeatureExtractor {
public:
// Callback to be run when feature extraction finishes. The callback
// argument is true if extraction was successful, false otherwise.
typedef base::Callback<void(bool)> DoneCallback;
// Creates a PhishingTermFeatureExtractor which will extract features for
// all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
// terms may be multi-word n-grams, with at most |max_words_per_term| words.
//
// |page_word_hashes| contains the murmur3 hashes (as uint32 values) for all
// of the individual words that make up the terms. The terms and words are
// UTF-8 encoded and lowercased prior to hashing. The caller owns both hash
// sets, and must ensure that they are valid until the
// PhishingTermFeatureExtractor is destroyed.
//
// |murmurhash3_seed| is the seed for murmurhash3.
// NOTE(review): presumably this must be the same seed that was used to
// produce |page_word_hashes| -- confirm against the code that builds the
// model.
//
// |clock| is used for timing feature extractor operations, and may be mocked
// for testing. The caller keeps ownership of the clock.
PhishingTermFeatureExtractor(
const base::hash_set<std::string>* page_term_hashes,
const base::hash_set<uint32>* page_word_hashes,
size_t max_words_per_term,
uint32 murmurhash3_seed,
FeatureExtractorClock* clock);
~PhishingTermFeatureExtractor();
// Begins extracting features from |page_text| into the given FeatureMap.
// |page_text| should contain the plain text of a web page, including any
// subframes, as returned by RenderView::CaptureText().
//
// To avoid blocking the render thread for too long, the feature extractor
// may run in several chunks of work, posting a task to the current
// MessageLoop to continue processing. Once feature extraction is complete,
// |done_callback| is run on the current thread with true on success and
// false otherwise (see DoneCallback).
// PhishingTermFeatureExtractor takes ownership of the callback.
//
// |page_text| and |features| are owned by the caller, and must not be
// destroyed until either |done_callback| is run or
// CancelPendingExtraction() is called.
void ExtractFeatures(const base::string16* page_text,
FeatureMap* features,
const DoneCallback& done_callback);
// Cancels any pending feature extraction. The DoneCallback will not be run.
// Must be called if there is a feature extraction in progress when the page
// is unloaded or the PhishingTermFeatureExtractor is destroyed.
void CancelPendingExtraction();
private:
// Holds the progress of an in-flight extraction (see state_). Only declared
// here; the definition is not part of this header.
struct ExtractionState;
// The values of the constants below are defined outside this header.
//
// The maximum amount of wall time that we will spend on a single extraction
// iteration before pausing to let other MessageLoop tasks run.
static const int kMaxTimePerChunkMs;
// The number of words that we will process before checking to see whether
// kMaxTimePerChunkMs has elapsed. Since checking the current time can be
// slow, we don't do this on every word processed.
static const int kClockCheckGranularity;
// The maximum total amount of time that the feature extractor will run
// before giving up on the current page.
static const int kMaxTotalTimeMs;
// The size of the cache that we use to determine if we can avoid lower
// casing, hashing, and UTF conversion.
// NOTE(review): presumably the capacity of negative_word_cache_ -- confirm
// in the implementation file.
static const int kMaxNegativeWordCacheSize;
// Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
// until a predefined maximum amount of time has elapsed, then posts a task
// to the current MessageLoop to continue extraction. When extraction
// finishes, calls RunCallback().
void ExtractFeaturesWithTimeout();
// Handles a single word in the page text. |word| is passed as a
// StringPiece16 rather than an owning string.
// NOTE(review): presumably |word| refers to characters inside |page_text_|
// -- confirm in the implementation file.
void HandleWord(const base::StringPiece16& word);
// Helper to verify that there is no pending feature extraction. Dies in
// debug builds if the state is not as expected. This is a no-op in release
// builds.
void CheckNoPendingExtraction();
// Runs |done_callback_| with |success| and then clears all internal state.
void RunCallback(bool success);
// Clears all internal feature extraction state.
void Clear();
// All of the term hashes that we are looking for in the page. Not owned;
// the constructor comment requires the caller to keep this set alive.
const base::hash_set<std::string>* page_term_hashes_;
// Murmur3 hashes of all the individual words in page_term_hashes_. If
// page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
// would contain (hashed) "one" and "two". We do this so that we can have a
// quick out in the common case that the current word we are processing
// doesn't contain any part of one of our terms. Not owned.
const base::hash_set<uint32>* page_word_hashes_;
// The maximum number of words in an n-gram.
const size_t max_words_per_term_;
// The seed for murmurhash3.
const uint32 murmurhash3_seed_;
// This cache is used to see if we need to check the word at all, as
// converting to UTF8, lowercasing, and hashing are all relatively expensive
// operations. Though this is called an MRU cache, it seems to behave like
// an LRU cache (i.e. it evicts the oldest accesses first).
// NOTE(review): the keys are StringPiece16 values, which do not own their
// characters; presumably they point into |page_text_|, so the cache would
// have to be emptied before that text goes away -- confirm in Clear().
typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;
WordCache negative_word_cache_;
// Non-owned pointer to our clock.
FeatureExtractorClock* clock_;
// The parameters from the most recent call to ExtractFeatures():
// |page_text_| is the input text, |features_| receives the extracted
// features, and |done_callback_| is run when extraction finishes.
const base::string16* page_text_; // The caller keeps ownership of this.
FeatureMap* features_; // The caller keeps ownership of this.
DoneCallback done_callback_;
// Stores the current state of term extraction from |page_text_|.
scoped_ptr<ExtractionState> state_;
// Used in scheduling ExtractFeaturesWithTimeout tasks.
// These pointers are invalidated if extraction is cancelled.
// Keep this as the last data member: members are destroyed in reverse
// declaration order, so the factory is torn down before the state above.
base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;
// Instances cannot be copied or assigned.
DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
};
} // namespace safe_browsing
#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_