chrome/renderer/safe_browsing/phishing_term_feature_extractor.h - platform/external/chromium_org - Git at Google

 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
 // PhishingTermFeatureExtractor handles computing term features from the text
 // of a web page for the client-side phishing detection model.  To do this, it
 // takes a list of terms that appear in the model, and scans through the page
 // text looking for them.  Any terms that appear will cause a corresponding
 // features::kPageTerm feature to be added to the FeatureMap.
 //
 // To make it harder for a phisher to enumerate all of the relevant terms in
 // the model, the terms are provided as SHA-256 hashes, rather than plain text.
 //
 // There is one PhishingTermFeatureExtractor per RenderView.

 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_

 #include <string>

 #include "base/basictypes.h"
 #include "base/callback.h"
 #include "base/containers/hash_tables.h"
 #include "base/containers/mru_cache.h"
 #include "base/memory/scoped_ptr.h"
 #include "base/memory/weak_ptr.h"
 #include "base/strings/string16.h"
 #include "base/strings/string_piece.h"

 namespace safe_browsing {
 // Forward declarations.  This header only refers to these types through
 // pointers, so their full definitions are not needed here.
 class FeatureExtractorClock;
 class FeatureMap;

 // Scans the text of a page for the model's terms and records each match in a
 // FeatureMap.  Extraction is started with ExtractFeatures(), may be spread
 // over several MessageLoop tasks, and reports completion through a
 // DoneCallback unless CancelPendingExtraction() is called first.
 class PhishingTermFeatureExtractor {
  public:
   // Callback to be run when feature extraction finishes.  The callback
   // argument is true if extraction was successful, false otherwise.
   typedef base::Callback<void(bool)> DoneCallback;

   // Creates a PhishingTermFeatureExtractor which will extract features for
   // all of the terms whose SHA-256 hashes are in |page_term_hashes|.  These
   // terms may be multi-word n-grams, with at most |max_words_per_term| words.
   //
   // |page_word_hashes| contains the murmur3 hashes for all of the individual
   // words that make up the terms.  Terms and words are UTF-8 encoded and
   // lowercased prior to hashing.  The caller owns both hash sets, and must
   // ensure that they are valid until the PhishingTermFeatureExtractor is
   // destroyed.
   //
   // |murmurhash3_seed| is the seed for murmurhash3.
   // NOTE(review): presumably this must be the same seed that was used to
   // produce |page_word_hashes| -- confirm against the caller.
   //
   // |clock| is used for timing feature extractor operations, and may be mocked
   // for testing.  The caller keeps ownership of the clock.
   PhishingTermFeatureExtractor(
       const base::hash_set<std::string>* page_term_hashes,
       const base::hash_set<uint32>* page_word_hashes,
       size_t max_words_per_term,
       uint32 murmurhash3_seed,
       FeatureExtractorClock* clock);
   ~PhishingTermFeatureExtractor();

   // Begins extracting features from |page_text| into the given FeatureMap.
   // |page_text| should contain the plain text of a web page, including any
   // subframes, as returned by RenderView::CaptureText().
   //
   // To avoid blocking the render thread for too long, the feature extractor
   // may run in several chunks of work, posting a task to the current
   // MessageLoop to continue processing.  Once feature extraction is complete,
   // |done_callback| is run on the current thread.
   // PhishingTermFeatureExtractor takes ownership of the callback.
   //
   // |page_text| and |features| are owned by the caller, and must not be
   // destroyed until either |done_callback| is run or
   // CancelPendingExtraction() is called.
   //
   // NOTE(review): presumably only one extraction may be pending at a time
   // (see CheckNoPendingExtraction()) -- confirm in the implementation.
   void ExtractFeatures(const base::string16* page_text,
                        FeatureMap* features,
                        const DoneCallback& done_callback);

   // Cancels any pending feature extraction.  The DoneCallback will not be run.
   // Must be called if there is a feature extraction in progress when the page
   // is unloaded or the PhishingTermFeatureExtractor is destroyed.
   void CancelPendingExtraction();

  private:
   // Per-extraction bookkeeping, held in |state_|.  Only declared here; the
   // definition lives outside this header.
   struct ExtractionState;

   // The tuning constants below are only declared here; their values are
   // defined outside this header.

   // The maximum amount of wall time that we will spend on a single extraction
   // iteration before pausing to let other MessageLoop tasks run.
   static const int kMaxTimePerChunkMs;

   // The number of words that we will process before checking to see whether
   // kMaxTimePerChunkMs has elapsed.  Since checking the current time can be
   // slow, we don't do this on every word processed.
   static const int kClockCheckGranularity;

   // The maximum total amount of time that the feature extractor will run
   // before giving up on the current page.
   static const int kMaxTotalTimeMs;

   // The size of the cache that we use to determine if we can avoid lower
   // casing, hashing, and UTF conversion.
   static const int kMaxNegativeWordCacheSize;

   // Does the actual work of ExtractFeatures.  ExtractFeaturesWithTimeout runs
   // until a predefined maximum amount of time has elapsed, then posts a task
   // to the current MessageLoop to continue extraction.  When extraction
   // finishes, calls RunCallback().
   void ExtractFeaturesWithTimeout();

   // Handles a single word in the page text.  |word| is a non-owning view of
   // the word's UTF-16 characters.
   void HandleWord(const base::StringPiece16& word);

   // Helper to verify that there is no pending feature extraction.  Dies in
   // debug builds if the state is not as expected.  This is a no-op in release
   // builds.
   void CheckNoPendingExtraction();

   // Runs |done_callback_| and then clears all internal state.  |success| is
   // the value passed to the callback.
   void RunCallback(bool success);

   // Clears all internal feature extraction state.
   void Clear();

   // All of the term hashes that we are looking for in the page.  Not owned.
   const base::hash_set<std::string>* page_term_hashes_;

   // Murmur3 hashes of all the individual words in page_term_hashes_.  If
   // page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
   // would contain (hashed) "one" and "two".  We do this so that we can have a
   // quick out in the common case that the current word we are processing
   // doesn't contain any part of one of our terms.  Not owned.
   const base::hash_set<uint32>* page_word_hashes_;

   // The maximum number of words in an n-gram.
   const size_t max_words_per_term_;

   // The seed for murmurhash3.
   const uint32 murmurhash3_seed_;

   // This cache is used to see if we need to check the word at all, as
   // converting to UTF8, lowercasing, and hashing are all relatively expensive
   // operations. Though this is called an MRU cache, it seems to behave like
   // an LRU cache (i.e. it evicts the oldest accesses first).
   //
   // NOTE(review): the keys are StringPiece16 values, which do not own their
   // characters.  Presumably they point into |page_text_|, in which case the
   // cache must be emptied before that text goes away -- confirm in Clear().
   typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;
   WordCache negative_word_cache_;

   // Non-owned pointer to our clock.
   FeatureExtractorClock* clock_;

   // The arguments from the most recent call to ExtractFeatures():
   // |page_text_| is the input text and |features_| receives the output.
   const base::string16* page_text_;  // The caller keeps ownership of this.
   FeatureMap* features_;  // The caller keeps ownership of this.
   DoneCallback done_callback_;

   // Stores the current state of term extraction from |page_text_|.
   scoped_ptr<ExtractionState> state_;

   // Used in scheduling ExtractFeaturesWithTimeout tasks.
   // These pointers are invalidated if extraction is cancelled.
   // This is the last data member, so it is destroyed before the others.
   // NOTE(review): keep it last; that ordering is presumably intentional so
   // that outstanding weak pointers are invalidated first -- confirm against
   // the WeakPtrFactory documentation.
   base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;

   DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
 };

 }  // namespace safe_browsing

 #endif  // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.
	//
	// PhishingTermFeatureExtractor handles computing term features from the text
	// of a web page for the client-side phishing detection model. To do this, it
	// takes a list of terms that appear in the model, and scans through the page
	// text looking for them. Any terms that appear will cause a corresponding
	// features::kPageTerm feature to be added to the FeatureMap.
	//
	// To make it harder for a phisher to enumerate all of the relevant terms in
	// the model, the terms are provided as SHA-256 hashes, rather than plain text.
	//
	// There is one PhishingTermFeatureExtractor per RenderView.

	#ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_
	#define CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_

	#include <string>

	#include "base/basictypes.h"
	#include "base/callback.h"
	#include "base/containers/hash_tables.h"
	#include "base/containers/mru_cache.h"
	#include "base/memory/scoped_ptr.h"
	#include "base/memory/weak_ptr.h"
	#include "base/strings/string16.h"
	#include "base/strings/string_piece.h"

	namespace safe_browsing {
	// Forward declarations. This header only refers to these types through
	// pointers, so their full definitions are not needed here.
	class FeatureExtractorClock;
	class FeatureMap;

	// Scans the text of a page for the model's terms and records each match in a
	// FeatureMap. Extraction is started with ExtractFeatures(), may be spread
	// over several MessageLoop tasks, and reports completion through a
	// DoneCallback unless CancelPendingExtraction() is called first.
	class PhishingTermFeatureExtractor {
	public:
	// Callback to be run when feature extraction finishes. The callback
	// argument is true if extraction was successful, false otherwise.
	typedef base::Callback<void(bool)> DoneCallback;

	// Creates a PhishingTermFeatureExtractor which will extract features for
	// all of the terms whose SHA-256 hashes are in |page_term_hashes|. These
	// terms may be multi-word n-grams, with at most |max_words_per_term| words.
	//
	// |page_word_hashes| contains the murmur3 hashes for all of the individual
	// words that make up the terms. Terms and words are UTF-8 encoded and
	// lowercased prior to hashing. The caller owns both hash sets, and must
	// ensure that they are valid until the PhishingTermFeatureExtractor is
	// destroyed.
	//
	// |murmurhash3_seed| is the seed for murmurhash3.
	// NOTE(review): presumably this must be the same seed that was used to
	// produce |page_word_hashes| -- confirm against the caller.
	//
	// |clock| is used for timing feature extractor operations, and may be mocked
	// for testing. The caller keeps ownership of the clock.
	PhishingTermFeatureExtractor(
	const base::hash_set<std::string>* page_term_hashes,
	const base::hash_set<uint32>* page_word_hashes,
	size_t max_words_per_term,
	uint32 murmurhash3_seed,
	FeatureExtractorClock* clock);
	~PhishingTermFeatureExtractor();

	// Begins extracting features from |page_text| into the given FeatureMap.
	// |page_text| should contain the plain text of a web page, including any
	// subframes, as returned by RenderView::CaptureText().
	//
	// To avoid blocking the render thread for too long, the feature extractor
	// may run in several chunks of work, posting a task to the current
	// MessageLoop to continue processing. Once feature extraction is complete,
	// |done_callback| is run on the current thread.
	// PhishingTermFeatureExtractor takes ownership of the callback.
	//
	// |page_text| and |features| are owned by the caller, and must not be
	// destroyed until either |done_callback| is run or
	// CancelPendingExtraction() is called.
	//
	// NOTE(review): presumably only one extraction may be pending at a time
	// (see CheckNoPendingExtraction()) -- confirm in the implementation.
	void ExtractFeatures(const base::string16* page_text,
	FeatureMap* features,
	const DoneCallback& done_callback);

	// Cancels any pending feature extraction. The DoneCallback will not be run.
	// Must be called if there is a feature extraction in progress when the page
	// is unloaded or the PhishingTermFeatureExtractor is destroyed.
	void CancelPendingExtraction();

	private:
	// Per-extraction bookkeeping, held in |state_|. Only declared here; the
	// definition lives outside this header.
	struct ExtractionState;

	// The tuning constants below are only declared here; their values are
	// defined outside this header.

	// The maximum amount of wall time that we will spend on a single extraction
	// iteration before pausing to let other MessageLoop tasks run.
	static const int kMaxTimePerChunkMs;

	// The number of words that we will process before checking to see whether
	// kMaxTimePerChunkMs has elapsed. Since checking the current time can be
	// slow, we don't do this on every word processed.
	static const int kClockCheckGranularity;

	// The maximum total amount of time that the feature extractor will run
	// before giving up on the current page.
	static const int kMaxTotalTimeMs;

	// The size of the cache that we use to determine if we can avoid lower
	// casing, hashing, and UTF conversion.
	static const int kMaxNegativeWordCacheSize;

	// Does the actual work of ExtractFeatures. ExtractFeaturesWithTimeout runs
	// until a predefined maximum amount of time has elapsed, then posts a task
	// to the current MessageLoop to continue extraction. When extraction
	// finishes, calls RunCallback().
	void ExtractFeaturesWithTimeout();

	// Handles a single word in the page text. |word| is a non-owning view of
	// the word's UTF-16 characters.
	void HandleWord(const base::StringPiece16& word);

	// Helper to verify that there is no pending feature extraction. Dies in
	// debug builds if the state is not as expected. This is a no-op in release
	// builds.
	void CheckNoPendingExtraction();

	// Runs |done_callback_| and then clears all internal state. |success| is
	// the value passed to the callback.
	void RunCallback(bool success);

	// Clears all internal feature extraction state.
	void Clear();

	// All of the term hashes that we are looking for in the page. Not owned.
	const base::hash_set<std::string>* page_term_hashes_;

	// Murmur3 hashes of all the individual words in page_term_hashes_. If
	// page_term_hashes_ included (hashed) "one" and "one two", page_word_hashes_
	// would contain (hashed) "one" and "two". We do this so that we can have a
	// quick out in the common case that the current word we are processing
	// doesn't contain any part of one of our terms. Not owned.
	const base::hash_set<uint32>* page_word_hashes_;

	// The maximum number of words in an n-gram.
	const size_t max_words_per_term_;

	// The seed for murmurhash3.
	const uint32 murmurhash3_seed_;

	// This cache is used to see if we need to check the word at all, as
	// converting to UTF8, lowercasing, and hashing are all relatively expensive
	// operations. Though this is called an MRU cache, it seems to behave like
	// an LRU cache (i.e. it evicts the oldest accesses first).
	//
	// NOTE(review): the keys are StringPiece16 values, which do not own their
	// characters. Presumably they point into |page_text_|, in which case the
	// cache must be emptied before that text goes away -- confirm in Clear().
	typedef base::HashingMRUCache<base::StringPiece16, bool> WordCache;
	WordCache negative_word_cache_;

	// Non-owned pointer to our clock.
	FeatureExtractorClock* clock_;

	// The arguments from the most recent call to ExtractFeatures():
	// |page_text_| is the input text and |features_| receives the output.
	const base::string16* page_text_; // The caller keeps ownership of this.
	FeatureMap* features_; // The caller keeps ownership of this.
	DoneCallback done_callback_;

	// Stores the current state of term extraction from |page_text_|.
	scoped_ptr<ExtractionState> state_;

	// Used in scheduling ExtractFeaturesWithTimeout tasks.
	// These pointers are invalidated if extraction is cancelled.
	// This is the last data member, so it is destroyed before the others.
	// NOTE(review): keep it last; that ordering is presumably intentional so
	// that outstanding weak pointers are invalidated first -- confirm against
	// the WeakPtrFactory documentation.
	base::WeakPtrFactory<PhishingTermFeatureExtractor> weak_factory_;

	DISALLOW_COPY_AND_ASSIGN(PhishingTermFeatureExtractor);
	};

	} // namespace safe_browsing

	#endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_TERM_FEATURE_EXTRACTOR_H_