// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/speech/google_streaming_remote_engine.h"
#include <algorithm>
#include <string>
#include <vector>
#include "base/bind.h"
#include "base/rand_util.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/proto/google_streaming_api.pb.h"
#include "content/public/common/speech_recognition_error.h"
#include "content/public/common/speech_recognition_result.h"
#include "google_apis/google_api_keys.h"
#include "net/base/escape.h"
#include "net/base/load_flags.h"
#include "net/url_request/http_user_agent_settings.h"
#include "net/url_request/url_fetcher.h"
#include "net/url_request/url_request_context.h"
#include "net/url_request/url_request_context_getter.h"
#include "net/url_request/url_request_status.h"
using net::URLFetcher;
namespace content {
namespace {
const char kWebServiceBaseUrl[] =
"https://www.google.com/speech-api/full-duplex/v1";
const char kDownstreamUrl[] = "/down?";
const char kUpstreamUrl[] = "/up?";
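// The full-duplex API uses two simultaneous HTTP connections: a chunked
// POST to the "up" URL carrying the encoded audio, and a streaming GET on
// the "down" URL delivering recognition events. The two requests carry a
// shared "pair" query parameter so the server can associate them (see
// GenerateRequestKey() and ConnectBothStreams() below).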
const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC;
// This matches the maximum maxAlternatives value supported by the server.
const uint32 kMaxMaxAlternatives = 30;
// TODO(hans): Remove this and other logging when we don't need it anymore.
void DumpResponse(const std::string& response) {
DVLOG(1) << "------------";
proto::SpeechRecognitionEvent event;
if (!event.ParseFromString(response)) {
DVLOG(1) << "Parse failed!";
return;
}
if (event.has_status())
DVLOG(1) << "STATUS\t" << event.status();
for (int i = 0; i < event.result_size(); ++i) {
DVLOG(1) << "RESULT #" << i << ":";
const proto::SpeechRecognitionResult& res = event.result(i);
if (res.has_final())
DVLOG(1) << " FINAL:\t" << res.final();
if (res.has_stability())
DVLOG(1) << " STABILITY:\t" << res.stability();
for (int j = 0; j < res.alternative_size(); ++j) {
const proto::SpeechRecognitionAlternative& alt =
res.alternative(j);
if (alt.has_confidence())
DVLOG(1) << " CONFIDENCE:\t" << alt.confidence();
if (alt.has_transcript())
DVLOG(1) << " TRANSCRIPT:\t" << alt.transcript();
}
}
}
} // namespace
const int GoogleStreamingRemoteEngine::kAudioPacketIntervalMs = 100;
const int GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTesting = 0;
const int GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTesting = 1;
const int GoogleStreamingRemoteEngine::kWebserviceStatusNoError = 0;
const int GoogleStreamingRemoteEngine::kWebserviceStatusErrorNoMatch = 5;
GoogleStreamingRemoteEngine::GoogleStreamingRemoteEngine(
net::URLRequestContextGetter* context)
: url_context_(context),
previous_response_length_(0),
got_last_definitive_result_(false),
is_dispatching_event_(false),
state_(STATE_IDLE) {}
GoogleStreamingRemoteEngine::~GoogleStreamingRemoteEngine() {}
void GoogleStreamingRemoteEngine::SetConfig(
const SpeechRecognitionEngineConfig& config) {
config_ = config;
}
void GoogleStreamingRemoteEngine::StartRecognition() {
FSMEventArgs event_args(EVENT_START_RECOGNITION);
DispatchEvent(event_args);
}
void GoogleStreamingRemoteEngine::EndRecognition() {
FSMEventArgs event_args(EVENT_END_RECOGNITION);
DispatchEvent(event_args);
}
void GoogleStreamingRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
FSMEventArgs event_args(EVENT_AUDIO_CHUNK);
event_args.audio_data = &data;
DispatchEvent(event_args);
}
void GoogleStreamingRemoteEngine::AudioChunksEnded() {
FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED);
DispatchEvent(event_args);
}
void GoogleStreamingRemoteEngine::OnURLFetchComplete(const URLFetcher* source) {
const bool kResponseComplete = true;
DispatchHTTPResponse(source, kResponseComplete);
}
void GoogleStreamingRemoteEngine::OnURLFetchDownloadProgress(
const URLFetcher* source, int64 current, int64 total) {
const bool kPartialResponse = false;
DispatchHTTPResponse(source, kPartialResponse);
}
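// Funnels network callbacks from both fetchers into FSM events: a failed
// request becomes EVENT_UPSTREAM_ERROR or EVENT_DOWNSTREAM_ERROR, each fresh
// downstream protocol chunk becomes an EVENT_DOWNSTREAM_RESPONSE, and a
// completed downstream response becomes EVENT_DOWNSTREAM_CLOSED.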
void GoogleStreamingRemoteEngine::DispatchHTTPResponse(const URLFetcher* source,
bool end_of_response) {
DCHECK(CalledOnValidThread());
DCHECK(source);
const bool response_is_good = source->GetStatus().is_success() &&
source->GetResponseCode() == 200;
std::string response;
if (response_is_good)
source->GetResponseAsString(&response);
const size_t current_response_length = response.size();
DVLOG(1) << (source == downstream_fetcher_.get() ? "Downstream" : "Upstream")
<< "HTTP, code: " << source->GetResponseCode()
<< " length: " << current_response_length
<< " eor: " << end_of_response;
// URLFetcher always provides the entire response buffer, but we are only
// interested in the fresh data introduced by the last chunk. Therefore, we
// drop the prefix we have already processed.
if (current_response_length != 0) {
DCHECK_GE(current_response_length, previous_response_length_);
response.erase(0, previous_response_length_);
previous_response_length_ = current_response_length;
}
if (!response_is_good && source == downstream_fetcher_.get()) {
DVLOG(1) << "Downstream error " << source->GetResponseCode();
FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR);
DispatchEvent(event_args);
return;
}
if (!response_is_good && source == upstream_fetcher_.get()) {
DVLOG(1) << "Upstream error " << source->GetResponseCode()
<< " EOR " << end_of_response;
FSMEventArgs event_args(EVENT_UPSTREAM_ERROR);
DispatchEvent(event_args);
return;
}
// Ignore incoming data on the upstream connection.
if (source == upstream_fetcher_.get())
return;
DCHECK(response_is_good && source == downstream_fetcher_.get());
// The downstream response is organized in chunks, whose size is determined
// by a 4-byte prefix, transparently handled by the ChunkedByteBuffer class.
// Such chunks are sent by the speech recognition webservice over the HTTP
// downstream channel using HTTP chunked transfer (unrelated to our chunks).
// This function is called every time an HTTP chunk is received by the
// URL fetcher. However, there is no particular correspondence between our
// protocol chunks and HTTP chunks: a single HTTP chunk can contain a
// portion of one protocol chunk or several protocol chunks together.
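// Layout of a single protocol chunk (the length prefix is consumed by
// ChunkedByteBuffer):
//   [4-byte length N][N bytes: serialized proto::SpeechRecognitionEvent]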
chunked_byte_buffer_.Append(response);
// A single HTTP chunk can contain more than one data chunk, thus the while.
while (chunked_byte_buffer_.HasChunks()) {
FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE);
event_args.response = chunked_byte_buffer_.PopChunk();
DCHECK(event_args.response.get());
DumpResponse(std::string(event_args.response->begin(),
event_args.response->end()));
DispatchEvent(event_args);
}
if (end_of_response) {
FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED);
DispatchEvent(event_args);
}
}
bool GoogleStreamingRemoteEngine::IsRecognitionPending() const {
DCHECK(CalledOnValidThread());
return state_ != STATE_IDLE;
}
int GoogleStreamingRemoteEngine::GetDesiredAudioChunkDurationMs() const {
return kAudioPacketIntervalMs;
}
// ----------------------- Core FSM implementation ---------------------------
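// The FSM at a glance (happy path):
//   STATE_IDLE --EVENT_START_RECOGNITION--> STATE_BOTH_STREAMS_CONNECTED
//   STATE_BOTH_STREAMS_CONNECTED --EVENT_AUDIO_CHUNKS_ENDED-->
//       STATE_WAITING_DOWNSTREAM_RESULTS
//   STATE_WAITING_DOWNSTREAM_RESULTS --EVENT_DOWNSTREAM_CLOSED--> STATE_IDLE
// Errors and EVENT_END_RECOGNITION abort back to STATE_IDLE from any state.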
void GoogleStreamingRemoteEngine::DispatchEvent(
const FSMEventArgs& event_args) {
DCHECK(CalledOnValidThread());
DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
DCHECK_LE(state_, STATE_MAX_VALUE);
// Event dispatching must be sequential; otherwise it would break the
// assumptions of the finite state automaton model.
DCHECK(!is_dispatching_event_);
is_dispatching_event_ = true;
state_ = ExecuteTransitionAndGetNextState(event_args);
is_dispatching_event_ = false;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::ExecuteTransitionAndGetNextState(
const FSMEventArgs& event_args) {
const FSMEvent event = event_args.event;
switch (state_) {
case STATE_IDLE:
switch (event) {
case EVENT_START_RECOGNITION:
return ConnectBothStreams(event_args);
case EVENT_END_RECOGNITION:
// Note: EVENT_AUDIO_CHUNK and EVENT_AUDIO_CHUNKS_ENDED events can remain
// enqueued in case of abort, so we just silently drop them here.
case EVENT_AUDIO_CHUNK:
case EVENT_AUDIO_CHUNKS_ENDED:
// DOWNSTREAM_CLOSED can be received if we end up here due to an error.
case EVENT_DOWNSTREAM_CLOSED:
return DoNothing(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
case EVENT_DOWNSTREAM_RESPONSE:
return NotFeasible(event_args);
}
break;
case STATE_BOTH_STREAMS_CONNECTED:
switch (event) {
case EVENT_AUDIO_CHUNK:
return TransmitAudioUpstream(event_args);
case EVENT_DOWNSTREAM_RESPONSE:
return ProcessDownstreamResponse(event_args);
case EVENT_AUDIO_CHUNKS_ENDED:
return CloseUpstreamAndWaitForResults(event_args);
case EVENT_END_RECOGNITION:
return AbortSilently(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
case EVENT_DOWNSTREAM_CLOSED:
return AbortWithError(event_args);
case EVENT_START_RECOGNITION:
return NotFeasible(event_args);
}
break;
case STATE_WAITING_DOWNSTREAM_RESULTS:
switch (event) {
case EVENT_DOWNSTREAM_RESPONSE:
return ProcessDownstreamResponse(event_args);
case EVENT_DOWNSTREAM_CLOSED:
return RaiseNoMatchErrorIfGotNoResults(event_args);
case EVENT_END_RECOGNITION:
return AbortSilently(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
return AbortWithError(event_args);
case EVENT_START_RECOGNITION:
case EVENT_AUDIO_CHUNK:
case EVENT_AUDIO_CHUNKS_ENDED:
return NotFeasible(event_args);
}
break;
}
return NotFeasible(event_args);
}
// ----------- Contract for all the FSM evolution functions below -------------
// - They are guaranteed to be executed on the same thread (IO, except for
//   tests);
// - They are guaranteed not to be reentrant (with themselves or each other);
// - event_args members are guaranteed to be stable during the call.
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
DCHECK(!upstream_fetcher_.get());
DCHECK(!downstream_fetcher_.get());
encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
config_.audio_sample_rate,
config_.audio_num_bits_per_sample));
DCHECK(encoder_.get());
const std::string request_key = GenerateRequestKey();
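// The key is sent as the "pair=" parameter on both requests so that the
// server can associate the upstream and downstream halves of this session.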
// Setup downstream fetcher.
std::vector<std::string> downstream_args;
downstream_args.push_back(
"key=" + net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
downstream_args.push_back("pair=" + request_key);
downstream_args.push_back("output=pb");
GURL downstream_url(std::string(kWebServiceBaseUrl) +
std::string(kDownstreamUrl) +
JoinString(downstream_args, '&'));
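// The resulting URL looks like (values illustrative):
// https://www.google.com/speech-api/full-duplex/v1/down?key=<api_key>&pair=<key>&output=pb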
downstream_fetcher_.reset(URLFetcher::Create(
kDownstreamUrlFetcherIdForTesting, downstream_url, URLFetcher::GET,
this));
downstream_fetcher_->SetRequestContext(url_context_.get());
downstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
net::LOAD_DO_NOT_SEND_COOKIES |
net::LOAD_DO_NOT_SEND_AUTH_DATA);
downstream_fetcher_->Start();
// Setup upstream fetcher.
// TODO(hans): Support for user-selected grammars.
std::vector<std::string> upstream_args;
upstream_args.push_back("key=" +
net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
upstream_args.push_back("pair=" + request_key);
upstream_args.push_back("output=pb");
upstream_args.push_back(
"lang=" + net::EscapeQueryParamValue(GetAcceptedLanguages(), true));
upstream_args.push_back(
config_.filter_profanities ? "pFilter=2" : "pFilter=0");
if (config_.max_hypotheses > 0U) {
int max_alternatives = std::min(kMaxMaxAlternatives,
config_.max_hypotheses);
upstream_args.push_back("maxAlternatives=" +
base::UintToString(max_alternatives));
}
upstream_args.push_back("client=chromium");
if (!config_.hardware_info.empty()) {
upstream_args.push_back(
"xhw=" + net::EscapeQueryParamValue(config_.hardware_info, true));
}
if (config_.continuous)
upstream_args.push_back("continuous");
if (config_.interim_results)
upstream_args.push_back("interim");
GURL upstream_url(std::string(kWebServiceBaseUrl) +
std::string(kUpstreamUrl) +
JoinString(upstream_args, '&'));
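// The resulting URL looks like (values illustrative; optional parameters
// depend on |config_|):
// https://www.google.com/speech-api/full-duplex/v1/up?key=<api_key>&pair=<key>&output=pb&lang=en-US&pFilter=0&client=chromium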
upstream_fetcher_.reset(URLFetcher::Create(
kUpstreamUrlFetcherIdForTesting, upstream_url, URLFetcher::POST, this));
upstream_fetcher_->SetChunkedUpload(encoder_->mime_type());
upstream_fetcher_->SetRequestContext(url_context_.get());
upstream_fetcher_->SetReferrer(config_.origin_url);
upstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
net::LOAD_DO_NOT_SEND_COOKIES |
net::LOAD_DO_NOT_SEND_AUTH_DATA);
upstream_fetcher_->Start();
previous_response_length_ = 0;
return STATE_BOTH_STREAMS_CONNECTED;
}
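// Encodes one captured audio chunk (nominally kAudioPacketIntervalMs worth
// of samples) and appends the result as the next piece of the chunked POST
// body; the upload is finalized in CloseUpstreamAndWaitForResults().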
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::TransmitAudioUpstream(
const FSMEventArgs& event_args) {
DCHECK(upstream_fetcher_.get());
DCHECK(event_args.audio_data.get());
const AudioChunk& audio = *(event_args.audio_data.get());
DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
encoder_->Encode(audio);
scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
upstream_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
return state_;
}
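// Parses one protocol chunk into a proto::SpeechRecognitionEvent. A status
// other than STATUS_SUCCESS aborts recognition with the corresponding error
// code; otherwise, final and provisional results are forwarded to the
// delegate.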
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::ProcessDownstreamResponse(
const FSMEventArgs& event_args) {
DCHECK(event_args.response.get());
proto::SpeechRecognitionEvent ws_event;
if (!ws_event.ParseFromString(std::string(event_args.response->begin(),
event_args.response->end())))
return AbortWithError(event_args);
// An empty (default) event is used to notify us that the upstream has
// been connected. Ignore.
if (!ws_event.result_size() && (!ws_event.has_status() ||
ws_event.status() == proto::SpeechRecognitionEvent::STATUS_SUCCESS)) {
DVLOG(1) << "Received empty response";
return state_;
}
if (ws_event.has_status()) {
switch (ws_event.status()) {
case proto::SpeechRecognitionEvent::STATUS_SUCCESS:
break;
case proto::SpeechRecognitionEvent::STATUS_NO_SPEECH:
return Abort(SPEECH_RECOGNITION_ERROR_NO_SPEECH);
case proto::SpeechRecognitionEvent::STATUS_ABORTED:
return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
case proto::SpeechRecognitionEvent::STATUS_AUDIO_CAPTURE:
return Abort(SPEECH_RECOGNITION_ERROR_AUDIO);
case proto::SpeechRecognitionEvent::STATUS_NETWORK:
return Abort(SPEECH_RECOGNITION_ERROR_NETWORK);
case proto::SpeechRecognitionEvent::STATUS_NOT_ALLOWED:
// TODO(hans): We need a better error code for this.
return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
case proto::SpeechRecognitionEvent::STATUS_SERVICE_NOT_ALLOWED:
// TODO(hans): We need a better error code for this.
return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
case proto::SpeechRecognitionEvent::STATUS_BAD_GRAMMAR:
return Abort(SPEECH_RECOGNITION_ERROR_BAD_GRAMMAR);
case proto::SpeechRecognitionEvent::STATUS_LANGUAGE_NOT_SUPPORTED:
// TODO(hans): We need a better error code for this.
return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
}
}
SpeechRecognitionResults results;
for (int i = 0; i < ws_event.result_size(); ++i) {
const proto::SpeechRecognitionResult& ws_result = ws_event.result(i);
results.push_back(SpeechRecognitionResult());
SpeechRecognitionResult& result = results.back();
result.is_provisional = !(ws_result.has_final() && ws_result.final());
if (!result.is_provisional)
got_last_definitive_result_ = true;
for (int j = 0; j < ws_result.alternative_size(); ++j) {
const proto::SpeechRecognitionAlternative& ws_alternative =
ws_result.alternative(j);
SpeechRecognitionHypothesis hypothesis;
if (ws_alternative.has_confidence())
hypothesis.confidence = ws_alternative.confidence();
else if (ws_result.has_stability())
hypothesis.confidence = ws_result.stability();
DCHECK(ws_alternative.has_transcript());
// TODO(hans): Perhaps the transcript should be required in the proto?
if (ws_alternative.has_transcript())
hypothesis.utterance = base::UTF8ToUTF16(ws_alternative.transcript());
result.hypotheses.push_back(hypothesis);
}
}
delegate()->OnSpeechRecognitionEngineResults(results);
return state_;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::RaiseNoMatchErrorIfGotNoResults(
const FSMEventArgs& event_args) {
if (!got_last_definitive_result_) {
// Provide an empty result to signal that recognition ended without errors,
// but also without any further results.
delegate()->OnSpeechRecognitionEngineResults(SpeechRecognitionResults());
}
return AbortSilently(event_args);
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::CloseUpstreamAndWaitForResults(
const FSMEventArgs&) {
DCHECK(upstream_fetcher_.get());
DCHECK(encoder_.get());
DVLOG(1) << "Closing upstream.";
// The encoder requires a non-empty final buffer, so we encode a packet of
// silence in case the encoder has no pending data.
std::vector<short> samples(
config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
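// E.g., at a 16000 Hz sample rate: 16000 * 100 / 1000 = 1600 zero samples.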
scoped_refptr<AudioChunk> dummy_chunk =
new AudioChunk(reinterpret_cast<uint8*>(&samples[0]),
samples.size() * sizeof(short),
encoder_->bits_per_sample() / 8);
encoder_->Encode(*dummy_chunk.get());
encoder_->Flush();
scoped_refptr<AudioChunk> encoded_dummy_data =
encoder_->GetEncodedDataAndClear();
DCHECK(!encoded_dummy_data->IsEmpty());
encoder_.reset();
upstream_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
got_last_definitive_result_ = false;
return STATE_WAITING_DOWNSTREAM_RESULTS;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::CloseDownstream(const FSMEventArgs&) {
DCHECK(!upstream_fetcher_.get());
DCHECK(downstream_fetcher_.get());
DVLOG(1) << "Closing downstream.";
downstream_fetcher_.reset();
return STATE_IDLE;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::AbortSilently(const FSMEventArgs&) {
return Abort(SPEECH_RECOGNITION_ERROR_NONE);
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::AbortWithError(const FSMEventArgs&) {
return Abort(SPEECH_RECOGNITION_ERROR_NETWORK);
}
GoogleStreamingRemoteEngine::FSMState GoogleStreamingRemoteEngine::Abort(
SpeechRecognitionErrorCode error_code) {
DVLOG(1) << "Aborting with error " << error_code;
if (error_code != SPEECH_RECOGNITION_ERROR_NONE) {
delegate()->OnSpeechRecognitionEngineError(
SpeechRecognitionError(error_code));
}
downstream_fetcher_.reset();
upstream_fetcher_.reset();
encoder_.reset();
return STATE_IDLE;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::DoNothing(const FSMEventArgs&) {
return state_;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::NotFeasible(const FSMEventArgs& event_args) {
NOTREACHED() << "Unfeasible event " << event_args.event
<< " in state " << state_;
return state_;
}
std::string GoogleStreamingRemoteEngine::GetAcceptedLanguages() const {
std::string langs = config_.language;
if (langs.empty() && url_context_.get()) {
// If no language is provided, use the first language from the accept
// language list. If that list is empty, default to "en-US".
// Example contents of this list: "es,en-GB;q=0.8", "".
net::URLRequestContext* request_context =
url_context_->GetURLRequestContext();
DCHECK(request_context);
// TODO(pauljensen): GoogleStreamingRemoteEngine should be constructed with
// a reference to the HttpUserAgentSettings rather than accessing the
// accept language through the URLRequestContext.
if (request_context->http_user_agent_settings()) {
std::string accepted_language_list =
request_context->http_user_agent_settings()->GetAcceptLanguage();
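// E.g., "es,en-GB;q=0.8" yields "es" after the truncation below.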
size_t separator = accepted_language_list.find_first_of(",;");
if (separator != std::string::npos)
langs = accepted_language_list.substr(0, separator);
}
}
if (langs.empty())
langs = "en-US";
return langs;
}
// TODO(primiano): Is there any utility in the codebase that already does this?
std::string GoogleStreamingRemoteEngine::GenerateRequestKey() const {
const int64 kKeepLowBytes = 0x00000000FFFFFFFFLL;
const int64 kKeepHighBytes = 0xFFFFFFFF00000000LL;
// Keep only the least significant bits of the timestamp and combine them
// with random upper bits, in order to reduce the probability of collisions.
int64 key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) |
(base::RandUint64() & kKeepHighBytes);
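// Resulting layout: bits 63..32 are random, bits 31..0 come from the
// timestamp.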
return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
}
GoogleStreamingRemoteEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
: event(event_value) {
}
GoogleStreamingRemoteEngine::FSMEventArgs::~FSMEventArgs() {
}
} // namespace content