chrome/renderer/safe_browsing/phishing_classifier_delegate.cc - platform/external/chromium_org - Git at Google

 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "chrome/renderer/safe_browsing/phishing_classifier_delegate.h"

 #include <set>

 #include "base/bind.h"
 #include "base/callback.h"
 #include "base/lazy_instance.h"
 #include "base/logging.h"
 #include "base/metrics/histogram.h"
 #include "chrome/common/safe_browsing/csd.pb.h"
 #include "chrome/common/safe_browsing/safebrowsing_messages.h"
 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
 #include "chrome/renderer/safe_browsing/phishing_classifier.h"
 #include "chrome/renderer/safe_browsing/scorer.h"
 #include "content/public/renderer/document_state.h"
 #include "content/public/renderer/navigation_state.h"
 #include "content/public/renderer/render_thread.h"
 #include "content/public/renderer/render_view.h"
 #include "third_party/WebKit/public/platform/WebURL.h"
 #include "third_party/WebKit/public/web/WebDocument.h"
 #include "third_party/WebKit/public/web/WebFrame.h"
 #include "third_party/WebKit/public/web/WebView.h"

 using content::DocumentState;
 using content::NavigationState;
 using content::RenderThread;

 namespace safe_browsing {

 // Returns a copy of |url| whose ref component has been cleared.  |url|
 // itself is not modified.
 static GURL StripRef(const GURL& url) {
   GURL::Replacements clear_ref;
   clear_ref.ClearRef();
   GURL stripped(url.ReplaceComponents(clear_ref));
   return stripped;
 }

 // Registry of PhishingClassifierDelegate pointers.  The delegate
 // constructor inserts itself and the destructor erases itself;
 // OnSetPhishingModel() walks the set to hand each delegate the new scorer.
 typedef std::set<PhishingClassifierDelegate*> PhishingClassifierDelegates;
 static base::LazyInstance<PhishingClassifierDelegates>
     g_delegates = LAZY_INSTANCE_INITIALIZER;

 // Holds the Scorer most recently installed by OnSetPhishingModel().  The
 // delegate constructor reads it so that a delegate created after a model
 // was received still gets that scorer.
 static base::LazyInstance<scoped_ptr<const safe_browsing::Scorer> >
     g_phishing_scorer = LAZY_INSTANCE_INITIALIZER;

 // static
 // Factory for PhishingClassifierFilter.  The constructor is private and
 // instances are created through this static method to facilitate stubbing
 // out this class for binary-size reduction purposes.  Returns a newly
 // allocated filter as a raw pointer.
 PhishingClassifierFilter* PhishingClassifierFilter::Create() {
   PhishingClassifierFilter* filter = new PhishingClassifierFilter();
   return filter;
 }

 // Constructs the filter; only the RenderProcessObserver base is initialized
 // here.
 PhishingClassifierFilter::PhishingClassifierFilter()
     : RenderProcessObserver() {
 }

 // The destructor body is intentionally empty.
 PhishingClassifierFilter::~PhishingClassifierFilter() {
 }

 // Routes control messages for the filter.  SafeBrowsingMsg_SetPhishingModel
 // is dispatched to OnSetPhishingModel(); any other message is left
 // unhandled.  Returns true if |message| was handled here, false otherwise.
 bool PhishingClassifierFilter::OnControlMessageReceived(
     const IPC::Message& message) {
   // Assume the message is ours; the UNHANDLED clause flips this to false.
   bool handled = true;
   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierFilter, message)
     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_SetPhishingModel, OnSetPhishingModel)
     IPC_MESSAGE_UNHANDLED(handled = false)
   IPC_END_MESSAGE_MAP()
   return handled;
 }

 void PhishingClassifierFilter::OnSetPhishingModel(const std::string& model) {
   safe_browsing::Scorer* scorer = NULL;
   // An empty model string means we should disable client-side phishing
   // detection.
   if (!model.empty()) {
     scorer = safe_browsing::Scorer::Create(model);
     if (!scorer) {
       DLOG(ERROR) << "Unable to create a PhishingScorer - corrupt model?";
       return;
     }
   }
   PhishingClassifierDelegates::iterator i;
   for (i = g_delegates.Get().begin(); i != g_delegates.Get().end(); ++i) {
     (*i)->SetPhishingScorer(scorer);
   }
   g_phishing_scorer.Get().reset(scorer);
 }

 // static
 // Factory for PhishingClassifierDelegate.  The constructor is private and
 // instances are created through this static method to facilitate stubbing
 // out this class for binary-size reduction purposes.  |render_view| and
 // |classifier| are forwarded unchanged to the constructor.
 PhishingClassifierDelegate* PhishingClassifierDelegate::Create(
     content::RenderView* render_view, PhishingClassifier* classifier) {
   PhishingClassifierDelegate* delegate =
       new PhishingClassifierDelegate(render_view, classifier);
   return delegate;
 }

 // Registers the new delegate in g_delegates, adopts |classifier| (building a
 // PhishingClassifier with a FeatureExtractorClock when |classifier| is
 // NULL), and installs the globally stored scorer if one is already set.
 PhishingClassifierDelegate::PhishingClassifierDelegate(
     content::RenderView* render_view,
     PhishingClassifier* classifier)
     : content::RenderViewObserver(render_view),
       last_main_frame_transition_(content::PAGE_TRANSITION_LINK),
       have_page_text_(false),
       is_classifying_(false) {
   g_delegates.Get().insert(this);

   // A NULL |classifier| requests the default implementation.
   PhishingClassifier* owned_classifier = classifier;
   if (owned_classifier == NULL) {
     owned_classifier =
         new PhishingClassifier(render_view, new FeatureExtractorClock());
   }
   classifier_.reset(owned_classifier);

   // A model may have been received before this delegate was created.
   const safe_browsing::Scorer* current_scorer = g_phishing_scorer.Get().get();
   if (current_scorer != NULL)
     SetPhishingScorer(current_scorer);
 }

 // Cancels any pending classification with reason SHUTDOWN, then removes
 // this delegate from the global registry so OnSetPhishingModel() no longer
 // visits it.
 PhishingClassifierDelegate::~PhishingClassifierDelegate() {
   CancelPendingClassification(SHUTDOWN);
   PhishingClassifierDelegates& registry = g_delegates.Get();
   registry.erase(this);
 }

 // Hands |scorer| to the classifier and then tries to start classifying the
 // current page.  Does nothing if the RenderView no longer has a WebView.
 void PhishingClassifierDelegate::SetPhishingScorer(
     const safe_browsing::Scorer* scorer) {
   const bool view_is_tearing_down = !render_view()->GetWebView();
   if (view_is_tearing_down)
     return;

   // A classification running at this point means an existing scorer is
   // being replaced with a new model, so the current run is simply cancelled.
   // TODO(noelutz): if this happens too frequently we could also
   // replace the old scorer with the new one once classification is done
   // but this would complicate the code somewhat.
   if (is_classifying_)
     CancelPendingClassification(NEW_PHISHING_SCORER);

   classifier_->set_phishing_scorer(scorer);

   // Classification only begins if every precondition checked by
   // MaybeStartClassification() is met.
   MaybeStartClassification();
 }

 void PhishingClassifierDelegate::OnStartPhishingDetection(const GURL& url) {
   last_url_received_from_browser_ = StripRef(url);
   // Start classifying the current page if all conditions are met.
   // See MaybeStartClassification() for details.
   MaybeStartClassification();
 }

 // Called when a provisional load commits in |frame|.  Cancels any pending
 // classification and, for the main frame, remembers the transition type of
 // the navigation.  |is_new_navigation| is not used.
 void PhishingClassifierDelegate::DidCommitProvisionalLoad(
     WebKit::WebFrame* frame, bool is_new_navigation) {
   // A new page is starting to load, so cancel classification.
   //
   // TODO(bryner): We shouldn't need to cancel classification if the navigation
   // is within the same page.  However, if we let classification continue in
   // this case, we need to properly deal with the fact that PageCaptured will
   // be called again for the in-page navigation.  We need to be sure not to
   // swap out the page text while the term feature extractor is still running.
   NavigationState* nav_state =
       DocumentState::FromDataSource(frame->dataSource())->navigation_state();

   // The cancel reason only distinguishes in-page navigations from the rest.
   CancelClassificationReason reason = NAVIGATE_AWAY;
   if (nav_state->was_within_same_page())
     reason = NAVIGATE_WITHIN_PAGE;
   CancelPendingClassification(reason);

   const bool is_main_frame =
       (frame == render_view()->GetWebView()->mainFrame());
   if (is_main_frame)
     last_main_frame_transition_ = nav_state->transition_type();
 }

 // Receives the captured page text.  Preliminary captures are ignored.  For a
 // final capture the contents of |page_text| are swapped into
 // classifier_page_text_ (so |*page_text| holds the previous buffer contents
 // afterwards) and classification is attempted.
 void PhishingClassifierDelegate::PageCaptured(string16* page_text,
                                               bool preliminary_capture) {
   if (!preliminary_capture) {
     // Make sure no classification is running first: the page text string
     // must not be swapped out from underneath the term feature extractor.
     //
     // Note: Currently, if the url hasn't changed, we won't restart
     // classification in this case.  We may want to adjust this.
     CancelPendingClassification(PAGE_RECAPTURED);

     last_finished_load_url_ = GetToplevelUrl();
     classifier_page_text_.swap(*page_text);
     have_page_text_ = true;

     MaybeStartClassification();
   }
 }

 // Cancels the classifier's pending work (if the classifier is ready) and
 // drops the stored page text.  When a classification was marked as running,
 // |reason| is recorded in the SBClientPhishing.CancelClassificationReason
 // histogram and the running flag is cleared.
 void PhishingClassifierDelegate::CancelPendingClassification(
     CancelClassificationReason reason) {
   const bool was_classifying = is_classifying_;
   if (was_classifying) {
     UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.CancelClassificationReason",
                               reason,
                               CANCEL_CLASSIFICATION_MAX);
     is_classifying_ = false;
   }

   if (classifier_->is_ready())
     classifier_->CancelPendingClassification();

   // The page text is discarded whether or not anything was running.
   have_page_text_ = false;
   classifier_page_text_.clear();
 }

 // Routes messages for this delegate.
 // SafeBrowsingMsg_StartPhishingDetection is dispatched to
 // OnStartPhishingDetection(); any other message is left unhandled.  Returns
 // true if |message| was handled here, false otherwise.
 bool PhishingClassifierDelegate::OnMessageReceived(
     const IPC::Message& message) {
   // Assume the message is ours; the UNHANDLED clause flips this to false.
   bool handled = true;
   IPC_BEGIN_MESSAGE_MAP(PhishingClassifierDelegate, message)
     IPC_MESSAGE_HANDLER(SafeBrowsingMsg_StartPhishingDetection,
                         OnStartPhishingDetection)
     IPC_MESSAGE_UNHANDLED(handled = false)
   IPC_END_MESSAGE_MAP()
   return handled;
 }

 void PhishingClassifierDelegate::ClassificationDone(
     const ClientPhishingRequest& verdict) {
   // We no longer need the page text.
   classifier_page_text_.clear();
   VLOG(2) << "Phishy verdict = " << verdict.is_phishing()
           << " score = " << verdict.client_score();
   if (verdict.client_score() != PhishingClassifier::kInvalidScore) {
     DCHECK_EQ(last_url_sent_to_classifier_.spec(), verdict.url());
     RenderThread::Get()->Send(new SafeBrowsingHostMsg_PhishingDetectionDone(
         routing_id(), verdict.SerializeAsString()));
   }
 }

 // Returns the URL of the document currently loaded in the main frame of
 // this delegate's RenderView.
 GURL PhishingClassifierDelegate::GetToplevelUrl() {
   WebKit::WebFrame* main_frame = render_view()->GetWebView()->mainFrame();
   return main_frame->document().url();
 }

 void PhishingClassifierDelegate::MaybeStartClassification() {
   // We can begin phishing classification when the following conditions are
   // met:
   //  1. A Scorer has been created
   //  2. The browser has sent a StartPhishingDetection message for the current
   //     toplevel URL.
   //  3. The page has finished loading and the page text has been extracted.
   //  4. The load is a new navigation (not a session history navigation).
   //  5. The toplevel URL has not already been classified.
   //
   // Note that if we determine that this particular navigation should not be
   // classified at all (as opposed to deferring it until we get an IPC or the
   // load completes), we discard the page text since it won't be needed.
   if (!classifier_->is_ready()) {
     VLOG(2) << "Not starting classification, no Scorer created.";
     // Keep classifier_page_text_, in case a Scorer is set later.
     return;
   }

   if (last_main_frame_transition_ & content::PAGE_TRANSITION_FORWARD_BACK) {
     // Skip loads from session history navigation.  However, update the
     // last URL sent to the classifier, so that we'll properly detect
     // in-page navigations.
     VLOG(2) << "Not starting classification for back/forward navigation";
     last_url_sent_to_classifier_ = last_finished_load_url_;
     classifier_page_text_.clear();  // we won't need this.
     have_page_text_ = false;
     return;
   }

   GURL stripped_last_load_url(StripRef(last_finished_load_url_));
   if (stripped_last_load_url == StripRef(last_url_sent_to_classifier_)) {
     // We've already classified this toplevel URL, so this was likely an
     // in-page navigation or a subframe navigation.  The browser should not
     // send a StartPhishingDetection IPC in this case.
     VLOG(2) << "Toplevel URL is unchanged, not starting classification.";
     classifier_page_text_.clear();  // we won't need this.
     have_page_text_ = false;
     return;
   }

   if (!have_page_text_) {
     VLOG(2) << "Not starting classification, there is no page text ready.";
     return;
   }

   if (last_url_received_from_browser_ != stripped_last_load_url) {
     // The browser has not yet confirmed that this URL should be classified,
     // so defer classification for now.  Note: the ref does not affect
     // any of the browser's preclassification checks, so we don't require it
     // to match.
     VLOG(2) << "Not starting classification, last url from browser is "
             << last_url_received_from_browser_ << ", last finished load is "
             << last_finished_load_url_;
     // Keep classifier_page_text_, in case the browser notifies us later that
     // we should classify the URL.
     return;
   }

   VLOG(2) << "Starting classification for " << last_finished_load_url_;
   last_url_sent_to_classifier_ = last_finished_load_url_;
   is_classifying_ = true;
   classifier_->BeginClassification(
       &classifier_page_text_,
       base::Bind(&PhishingClassifierDelegate::ClassificationDone,
                  base::Unretained(this)));
 }

 }  // namespace safe_browsing
	// Copyright (c) 2011 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "chrome/renderer/safe_browsing/phishing_classifier_delegate.h"

	#include <set>

	#include "base/bind.h"
	#include "base/callback.h"
	#include "base/lazy_instance.h"
	#include "base/logging.h"
	#include "base/metrics/histogram.h"
	#include "chrome/common/safe_browsing/csd.pb.h"
	#include "chrome/common/safe_browsing/safebrowsing_messages.h"
	#include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
	#include "chrome/renderer/safe_browsing/phishing_classifier.h"
	#include "chrome/renderer/safe_browsing/scorer.h"
	#include "content/public/renderer/document_state.h"
	#include "content/public/renderer/navigation_state.h"
	#include "content/public/renderer/render_thread.h"
	#include "content/public/renderer/render_view.h"
	#include "third_party/WebKit/public/platform/WebURL.h"
	#include "third_party/WebKit/public/web/WebDocument.h"
	#include "third_party/WebKit/public/web/WebFrame.h"
	#include "third_party/WebKit/public/web/WebView.h"

	using content::DocumentState;
	using content::NavigationState;
	using content::RenderThread;

	namespace safe_browsing {

	// Returns a copy of |url| whose ref component has been cleared.  |url|
	// itself is not modified.
	static GURL StripRef(const GURL& url) {
	  GURL::Replacements clear_ref;
	  clear_ref.ClearRef();
	  GURL stripped(url.ReplaceComponents(clear_ref));
	  return stripped;
	}

	// Registry of PhishingClassifierDelegate pointers.  The delegate
	// constructor inserts itself and the destructor erases itself;
	// OnSetPhishingModel() walks the set to hand each delegate the new scorer.
	typedef std::set<PhishingClassifierDelegate*> PhishingClassifierDelegates;
	static base::LazyInstance<PhishingClassifierDelegates>
	g_delegates = LAZY_INSTANCE_INITIALIZER;

	// Holds the Scorer most recently installed by OnSetPhishingModel().  The
	// delegate constructor reads it so that a delegate created after a model
	// was received still gets that scorer.
	static base::LazyInstance<scoped_ptr<const safe_browsing::Scorer> >
	g_phishing_scorer = LAZY_INSTANCE_INITIALIZER;

	// static
	// Factory for PhishingClassifierFilter.  The constructor is private and
	// instances are created through this static method to facilitate stubbing
	// out this class for binary-size reduction purposes.  Returns a newly
	// allocated filter as a raw pointer.
	PhishingClassifierFilter* PhishingClassifierFilter::Create() {
	  PhishingClassifierFilter* filter = new PhishingClassifierFilter();
	  return filter;
	}

	// Constructs the filter; only the RenderProcessObserver base is initialized
	// here.
	PhishingClassifierFilter::PhishingClassifierFilter()
	    : RenderProcessObserver() {
	}

	// The destructor body is intentionally empty.
	PhishingClassifierFilter::~PhishingClassifierFilter() {
	}

	// Routes control messages for the filter.  SafeBrowsingMsg_SetPhishingModel
	// is dispatched to OnSetPhishingModel(); any other message is left
	// unhandled.  Returns true if |message| was handled here, false otherwise.
	bool PhishingClassifierFilter::OnControlMessageReceived(
	const IPC::Message& message) {
	// Assume the message is ours; the UNHANDLED clause flips this to false.
	bool handled = true;
	IPC_BEGIN_MESSAGE_MAP(PhishingClassifierFilter, message)
	IPC_MESSAGE_HANDLER(SafeBrowsingMsg_SetPhishingModel, OnSetPhishingModel)
	IPC_MESSAGE_UNHANDLED(handled = false)
	IPC_END_MESSAGE_MAP()
	return handled;
	}

	void PhishingClassifierFilter::OnSetPhishingModel(const std::string& model) {
	safe_browsing::Scorer* scorer = NULL;
	// An empty model string means we should disable client-side phishing
	// detection.
	if (!model.empty()) {
	scorer = safe_browsing::Scorer::Create(model);
	if (!scorer) {
	DLOG(ERROR) << "Unable to create a PhishingScorer - corrupt model?";
	return;
	}
	}
	PhishingClassifierDelegates::iterator i;
	for (i = g_delegates.Get().begin(); i != g_delegates.Get().end(); ++i) {
	(*i)->SetPhishingScorer(scorer);
	}
	g_phishing_scorer.Get().reset(scorer);
	}

	// static
	// Factory for PhishingClassifierDelegate.  The constructor is private and
	// instances are created through this static method to facilitate stubbing
	// out this class for binary-size reduction purposes.  |render_view| and
	// |classifier| are forwarded unchanged to the constructor.
	PhishingClassifierDelegate* PhishingClassifierDelegate::Create(
	    content::RenderView* render_view, PhishingClassifier* classifier) {
	  PhishingClassifierDelegate* delegate =
	      new PhishingClassifierDelegate(render_view, classifier);
	  return delegate;
	}

	// Registers the new delegate in g_delegates, adopts |classifier| (building a
	// PhishingClassifier with a FeatureExtractorClock when |classifier| is
	// NULL), and installs the globally stored scorer if one is already set.
	PhishingClassifierDelegate::PhishingClassifierDelegate(
	    content::RenderView* render_view,
	    PhishingClassifier* classifier)
	    : content::RenderViewObserver(render_view),
	      last_main_frame_transition_(content::PAGE_TRANSITION_LINK),
	      have_page_text_(false),
	      is_classifying_(false) {
	  g_delegates.Get().insert(this);

	  // A NULL |classifier| requests the default implementation.
	  PhishingClassifier* owned_classifier = classifier;
	  if (owned_classifier == NULL) {
	    owned_classifier =
	        new PhishingClassifier(render_view, new FeatureExtractorClock());
	  }
	  classifier_.reset(owned_classifier);

	  // A model may have been received before this delegate was created.
	  const safe_browsing::Scorer* current_scorer = g_phishing_scorer.Get().get();
	  if (current_scorer != NULL)
	    SetPhishingScorer(current_scorer);
	}

	// Cancels any pending classification with reason SHUTDOWN, then removes
	// this delegate from the global registry so OnSetPhishingModel() no longer
	// visits it.
	PhishingClassifierDelegate::~PhishingClassifierDelegate() {
	  CancelPendingClassification(SHUTDOWN);
	  PhishingClassifierDelegates& registry = g_delegates.Get();
	  registry.erase(this);
	}

	// Hands |scorer| to the classifier and then tries to start classifying the
	// current page.  Does nothing if the RenderView no longer has a WebView.
	void PhishingClassifierDelegate::SetPhishingScorer(
	    const safe_browsing::Scorer* scorer) {
	  const bool view_is_tearing_down = !render_view()->GetWebView();
	  if (view_is_tearing_down)
	    return;

	  // A classification running at this point means an existing scorer is
	  // being replaced with a new model, so the current run is simply cancelled.
	  // TODO(noelutz): if this happens too frequently we could also
	  // replace the old scorer with the new one once classification is done
	  // but this would complicate the code somewhat.
	  if (is_classifying_)
	    CancelPendingClassification(NEW_PHISHING_SCORER);

	  classifier_->set_phishing_scorer(scorer);

	  // Classification only begins if every precondition checked by
	  // MaybeStartClassification() is met.
	  MaybeStartClassification();
	}

	void PhishingClassifierDelegate::OnStartPhishingDetection(const GURL& url) {
	last_url_received_from_browser_ = StripRef(url);
	// Start classifying the current page if all conditions are met.
	// See MaybeStartClassification() for details.
	MaybeStartClassification();
	}

	// Called when a provisional load commits in |frame|.  Cancels any pending
	// classification and, for the main frame, remembers the transition type of
	// the navigation.  |is_new_navigation| is not used.
	void PhishingClassifierDelegate::DidCommitProvisionalLoad(
	    WebKit::WebFrame* frame, bool is_new_navigation) {
	  // A new page is starting to load, so cancel classification.
	  //
	  // TODO(bryner): We shouldn't need to cancel classification if the navigation
	  // is within the same page.  However, if we let classification continue in
	  // this case, we need to properly deal with the fact that PageCaptured will
	  // be called again for the in-page navigation.  We need to be sure not to
	  // swap out the page text while the term feature extractor is still running.
	  NavigationState* nav_state =
	      DocumentState::FromDataSource(frame->dataSource())->navigation_state();

	  // The cancel reason only distinguishes in-page navigations from the rest.
	  CancelClassificationReason reason = NAVIGATE_AWAY;
	  if (nav_state->was_within_same_page())
	    reason = NAVIGATE_WITHIN_PAGE;
	  CancelPendingClassification(reason);

	  const bool is_main_frame =
	      (frame == render_view()->GetWebView()->mainFrame());
	  if (is_main_frame)
	    last_main_frame_transition_ = nav_state->transition_type();
	}

	// Receives the captured page text.  Preliminary captures are ignored.  For a
	// final capture the contents of |page_text| are swapped into
	// classifier_page_text_ (so |*page_text| holds the previous buffer contents
	// afterwards) and classification is attempted.
	void PhishingClassifierDelegate::PageCaptured(string16* page_text,
	                                              bool preliminary_capture) {
	  if (!preliminary_capture) {
	    // Make sure no classification is running first: the page text string
	    // must not be swapped out from underneath the term feature extractor.
	    //
	    // Note: Currently, if the url hasn't changed, we won't restart
	    // classification in this case.  We may want to adjust this.
	    CancelPendingClassification(PAGE_RECAPTURED);

	    last_finished_load_url_ = GetToplevelUrl();
	    classifier_page_text_.swap(*page_text);
	    have_page_text_ = true;

	    MaybeStartClassification();
	  }
	}

	// Cancels the classifier's pending work (if the classifier is ready) and
	// drops the stored page text.  When a classification was marked as running,
	// |reason| is recorded in the SBClientPhishing.CancelClassificationReason
	// histogram and the running flag is cleared.
	void PhishingClassifierDelegate::CancelPendingClassification(
	    CancelClassificationReason reason) {
	  const bool was_classifying = is_classifying_;
	  if (was_classifying) {
	    UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.CancelClassificationReason",
	                              reason,
	                              CANCEL_CLASSIFICATION_MAX);
	    is_classifying_ = false;
	  }

	  if (classifier_->is_ready())
	    classifier_->CancelPendingClassification();

	  // The page text is discarded whether or not anything was running.
	  have_page_text_ = false;
	  classifier_page_text_.clear();
	}

	// Routes messages for this delegate.
	// SafeBrowsingMsg_StartPhishingDetection is dispatched to
	// OnStartPhishingDetection(); any other message is left unhandled.  Returns
	// true if |message| was handled here, false otherwise.
	bool PhishingClassifierDelegate::OnMessageReceived(
	const IPC::Message& message) {
	// Assume the message is ours; the UNHANDLED clause flips this to false.
	bool handled = true;
	IPC_BEGIN_MESSAGE_MAP(PhishingClassifierDelegate, message)
	IPC_MESSAGE_HANDLER(SafeBrowsingMsg_StartPhishingDetection,
	OnStartPhishingDetection)
	IPC_MESSAGE_UNHANDLED(handled = false)
	IPC_END_MESSAGE_MAP()
	return handled;
	}

	void PhishingClassifierDelegate::ClassificationDone(
	const ClientPhishingRequest& verdict) {
	// We no longer need the page text.
	classifier_page_text_.clear();
	VLOG(2) << "Phishy verdict = " << verdict.is_phishing()
	<< " score = " << verdict.client_score();
	if (verdict.client_score() != PhishingClassifier::kInvalidScore) {
	DCHECK_EQ(last_url_sent_to_classifier_.spec(), verdict.url());
	RenderThread::Get()->Send(new SafeBrowsingHostMsg_PhishingDetectionDone(
	routing_id(), verdict.SerializeAsString()));
	}
	}

	// Returns the URL of the document currently loaded in the main frame of
	// this delegate's RenderView.
	GURL PhishingClassifierDelegate::GetToplevelUrl() {
	  WebKit::WebFrame* main_frame = render_view()->GetWebView()->mainFrame();
	  return main_frame->document().url();
	}

	void PhishingClassifierDelegate::MaybeStartClassification() {
	// We can begin phishing classification when the following conditions are
	// met:
	// 1. A Scorer has been created
	// 2. The browser has sent a StartPhishingDetection message for the current
	// toplevel URL.
	// 3. The page has finished loading and the page text has been extracted.
	// 4. The load is a new navigation (not a session history navigation).
	// 5. The toplevel URL has not already been classified.
	//
	// Note that if we determine that this particular navigation should not be
	// classified at all (as opposed to deferring it until we get an IPC or the
	// load completes), we discard the page text since it won't be needed.
	if (!classifier_->is_ready()) {
	VLOG(2) << "Not starting classification, no Scorer created.";
	// Keep classifier_page_text_, in case a Scorer is set later.
	return;
	}

	if (last_main_frame_transition_ & content::PAGE_TRANSITION_FORWARD_BACK) {
	// Skip loads from session history navigation. However, update the
	// last URL sent to the classifier, so that we'll properly detect
	// in-page navigations.
	VLOG(2) << "Not starting classification for back/forward navigation";
	last_url_sent_to_classifier_ = last_finished_load_url_;
	classifier_page_text_.clear(); // we won't need this.
	have_page_text_ = false;
	return;
	}

	GURL stripped_last_load_url(StripRef(last_finished_load_url_));
	if (stripped_last_load_url == StripRef(last_url_sent_to_classifier_)) {
	// We've already classified this toplevel URL, so this was likely an
	// in-page navigation or a subframe navigation. The browser should not
	// send a StartPhishingDetection IPC in this case.
	VLOG(2) << "Toplevel URL is unchanged, not starting classification.";
	classifier_page_text_.clear(); // we won't need this.
	have_page_text_ = false;
	return;
	}

	if (!have_page_text_) {
	VLOG(2) << "Not starting classification, there is no page text ready.";
	return;
	}

	if (last_url_received_from_browser_ != stripped_last_load_url) {
	// The browser has not yet confirmed that this URL should be classified,
	// so defer classification for now. Note: the ref does not affect
	// any of the browser's preclassification checks, so we don't require it
	// to match.
	VLOG(2) << "Not starting classification, last url from browser is "
	<< last_url_received_from_browser_ << ", last finished load is "
	<< last_finished_load_url_;
	// Keep classifier_page_text_, in case the browser notifies us later that
	// we should classify the URL.
	return;
	}

	VLOG(2) << "Starting classification for " << last_finished_load_url_;
	last_url_sent_to_classifier_ = last_finished_load_url_;
	is_classifying_ = true;
	classifier_->BeginClassification(
	&classifier_page_text_,
	base::Bind(&PhishingClassifierDelegate::ClassificationDone,
	base::Unretained(this)));
	}

	} // namespace safe_browsing