// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/browser/speech/google_streaming_remote_engine.h"
#include <algorithm>
#include <string>
#include <vector>
#include "base/bind.h"
#include "base/rand_util.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/time/time.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/proto/google_streaming_api.pb.h"
#include "content/public/common/speech_recognition_error.h"
#include "content/public/common/speech_recognition_result.h"
#include "google_apis/google_api_keys.h"
#include "net/base/escape.h"
#include "net/base/load_flags.h"
#include "net/url_request/http_user_agent_settings.h"
#include "net/url_request/url_fetcher.h"
#include "net/url_request/url_request_context.h"
#include "net/url_request/url_request_context_getter.h"
#include "net/url_request/url_request_status.h"
using net::URLFetcher;
namespace content {
namespace {
const char kWebServiceBaseUrl[] =
"https://www.google.com/speech-api/full-duplex/v1";
const char kDownstreamUrl[] = "/down?";
const char kUpstreamUrl[] = "/up?";
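// The full-duplex API uses two simultaneous HTTP connections: a chunked
// POST to the "up" URL carrying the encoded audio, and a streaming GET on
// the "down" URL delivering recognition events. The two requests carry a
// shared "pair" query parameter so the server can associate them (see
// GenerateRequestKey() and ConnectBothStreams() below).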
const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC;
// This matches the maximum maxAlternatives value supported by the server.
const uint32 kMaxMaxAlternatives = 30;
// TODO(hans): Remove this and other logging when we don't need it anymore.
void DumpResponse(const std::string& response) {
DVLOG(1) << "------------";
proto::SpeechRecognitionEvent event;
if (!event.ParseFromString(response)) {
DVLOG(1) << "Parse failed!";
return;
}
if (event.has_status())
DVLOG(1) << "STATUS\t" << event.status();
for (int i = 0; i < event.result_size(); ++i) {
DVLOG(1) << "RESULT #" << i << ":";
const proto::SpeechRecognitionResult& res = event.result(i);
if (res.has_final())
DVLOG(1) << " FINAL:\t" << res.final();
if (res.has_stability())
DVLOG(1) << " STABILITY:\t" << res.stability();
for (int j = 0; j < res.alternative_size(); ++j) {
const proto::SpeechRecognitionAlternative& alt =
res.alternative(j);
if (alt.has_confidence())
DVLOG(1) << " CONFIDENCE:\t" << alt.confidence();
if (alt.has_transcript())
DVLOG(1) << " TRANSCRIPT:\t" << alt.transcript();
}
}
}
} // namespace
const int GoogleStreamingRemoteEngine::kAudioPacketIntervalMs = 100;
const int GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTesting = 0;
const int GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTesting = 1;
const int GoogleStreamingRemoteEngine::kWebserviceStatusNoError = 0;
const int GoogleStreamingRemoteEngine::kWebserviceStatusErrorNoMatch = 5;
GoogleStreamingRemoteEngine::GoogleStreamingRemoteEngine(
net::URLRequestContextGetter* context)
: url_context_(context),
previous_response_length_(0),
got_last_definitive_result_(false),
is_dispatching_event_(false),
state_(STATE_IDLE) {}
GoogleStreamingRemoteEngine::~GoogleStreamingRemoteEngine() {}
void GoogleStreamingRemoteEngine::SetConfig(
const SpeechRecognitionEngineConfig& config) {
config_ = config;
}
void GoogleStreamingRemoteEngine::StartRecognition() {
FSMEventArgs event_args(EVENT_START_RECOGNITION);
DispatchEvent(event_args);
}
void GoogleStreamingRemoteEngine::EndRecognition() {
FSMEventArgs event_args(EVENT_END_RECOGNITION);
DispatchEvent(event_args);
}
void GoogleStreamingRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
FSMEventArgs event_args(EVENT_AUDIO_CHUNK);
event_args.audio_data = &data;
DispatchEvent(event_args);
}
void GoogleStreamingRemoteEngine::AudioChunksEnded() {
FSMEventArgs event_args(EVENT_AUDIO_CHUNKS_ENDED);
DispatchEvent(event_args);
}
void GoogleStreamingRemoteEngine::OnURLFetchComplete(const URLFetcher* source) {
const bool kResponseComplete = true;
DispatchHTTPResponse(source, kResponseComplete);
}
void GoogleStreamingRemoteEngine::OnURLFetchDownloadProgress(
const URLFetcher* source, int64 current, int64 total) {
const bool kPartialResponse = false;
DispatchHTTPResponse(source, kPartialResponse);
}
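// Funnels network callbacks from both fetchers into FSM events: a failed
// request becomes EVENT_UPSTREAM_ERROR or EVENT_DOWNSTREAM_ERROR, each fresh
// downstream protocol chunk becomes an EVENT_DOWNSTREAM_RESPONSE, and a
// completed downstream response becomes EVENT_DOWNSTREAM_CLOSED.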
void GoogleStreamingRemoteEngine::DispatchHTTPResponse(const URLFetcher* source,
bool end_of_response) {
DCHECK(CalledOnValidThread());
DCHECK(source);
const bool response_is_good = source->GetStatus().is_success() &&
source->GetResponseCode() == 200;
std::string response;
if (response_is_good)
source->GetResponseAsString(&response);
const size_t current_response_length = response.size();
DVLOG(1) << (source == downstream_fetcher_.get() ? "Downstream" : "Upstream")
<< "HTTP, code: " << source->GetResponseCode()
<< " length: " << current_response_length
<< " eor: " << end_of_response;
// URLFetcher always provides the entire response buffer, but we are only
// interested in the fresh data introduced by the last chunk. Therefore, we
// drop the prefix we have already processed.
if (current_response_length != 0) {
DCHECK_GE(current_response_length, previous_response_length_);
response.erase(0, previous_response_length_);
previous_response_length_ = current_response_length;
}
if (!response_is_good && source == downstream_fetcher_.get()) {
DVLOG(1) << "Downstream error " << source->GetResponseCode();
FSMEventArgs event_args(EVENT_DOWNSTREAM_ERROR);
DispatchEvent(event_args);
return;
}
if (!response_is_good && source == upstream_fetcher_.get()) {
DVLOG(1) << "Upstream error " << source->GetResponseCode()
<< " EOR " << end_of_response;
FSMEventArgs event_args(EVENT_UPSTREAM_ERROR);
DispatchEvent(event_args);
return;
}
// Ignore incoming data on the upstream connection.
if (source == upstream_fetcher_.get())
return;
DCHECK(response_is_good && source == downstream_fetcher_.get());
// The downstream response is organized in chunks, whose size is determined
// by a 4-byte prefix, transparently handled by the ChunkedByteBuffer class.
// Such chunks are sent by the speech recognition webservice over the HTTP
// downstream channel using HTTP chunked transfer (unrelated to our chunks).
// This function is called every time an HTTP chunk is received by the
// URL fetcher. However, there is no particular correspondence between our
// protocol chunks and HTTP chunks: a single HTTP chunk can contain a
// portion of one protocol chunk or several protocol chunks together.
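// Layout of a single protocol chunk (the length prefix is consumed by
// ChunkedByteBuffer):
//   [4-byte length N][N bytes: serialized proto::SpeechRecognitionEvent]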
chunked_byte_buffer_.Append(response);
// A single HTTP chunk can contain more than one data chunk, thus the while.
while (chunked_byte_buffer_.HasChunks()) {
FSMEventArgs event_args(EVENT_DOWNSTREAM_RESPONSE);
event_args.response = chunked_byte_buffer_.PopChunk();
DCHECK(event_args.response.get());
DumpResponse(std::string(event_args.response->begin(),
event_args.response->end()));
DispatchEvent(event_args);
}
if (end_of_response) {
FSMEventArgs event_args(EVENT_DOWNSTREAM_CLOSED);
DispatchEvent(event_args);
}
}
bool GoogleStreamingRemoteEngine::IsRecognitionPending() const {
DCHECK(CalledOnValidThread());
return state_ != STATE_IDLE;
}
int GoogleStreamingRemoteEngine::GetDesiredAudioChunkDurationMs() const {
return kAudioPacketIntervalMs;
}
// ----------------------- Core FSM implementation ---------------------------
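// The FSM at a glance (happy path):
//   STATE_IDLE --EVENT_START_RECOGNITION--> STATE_BOTH_STREAMS_CONNECTED
//   STATE_BOTH_STREAMS_CONNECTED --EVENT_AUDIO_CHUNKS_ENDED-->
//       STATE_WAITING_DOWNSTREAM_RESULTS
//   STATE_WAITING_DOWNSTREAM_RESULTS --EVENT_DOWNSTREAM_CLOSED--> STATE_IDLE
// Errors and EVENT_END_RECOGNITION abort back to STATE_IDLE from any state.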
void GoogleStreamingRemoteEngine::DispatchEvent(
const FSMEventArgs& event_args) {
DCHECK(CalledOnValidThread());
DCHECK_LE(event_args.event, EVENT_MAX_VALUE);
DCHECK_LE(state_, STATE_MAX_VALUE);
// Event dispatching must be sequential; otherwise it would break the
// assumptions of the finite state automaton model.
DCHECK(!is_dispatching_event_);
is_dispatching_event_ = true;
state_ = ExecuteTransitionAndGetNextState(event_args);
is_dispatching_event_ = false;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::ExecuteTransitionAndGetNextState(
const FSMEventArgs& event_args) {
const FSMEvent event = event_args.event;
switch (state_) {
case STATE_IDLE:
switch (event) {
case EVENT_START_RECOGNITION:
return ConnectBothStreams(event_args);
case EVENT_END_RECOGNITION:
// Note: EVENT_AUDIO_CHUNK and EVENT_AUDIO_CHUNKS_ENDED events can remain
// enqueued in case of abort, so we just silently drop them here.
case EVENT_AUDIO_CHUNK:
case EVENT_AUDIO_CHUNKS_ENDED:
// DOWNSTREAM_CLOSED can be received if we end up here due to an error.
case EVENT_DOWNSTREAM_CLOSED:
return DoNothing(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
case EVENT_DOWNSTREAM_RESPONSE:
return NotFeasible(event_args);
}
break;
case STATE_BOTH_STREAMS_CONNECTED:
switch (event) {
case EVENT_AUDIO_CHUNK:
return TransmitAudioUpstream(event_args);
case EVENT_DOWNSTREAM_RESPONSE:
return ProcessDownstreamResponse(event_args);
case EVENT_AUDIO_CHUNKS_ENDED:
return CloseUpstreamAndWaitForResults(event_args);
case EVENT_END_RECOGNITION:
return AbortSilently(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
case EVENT_DOWNSTREAM_CLOSED:
return AbortWithError(event_args);
case EVENT_START_RECOGNITION:
return NotFeasible(event_args);
}
break;
case STATE_WAITING_DOWNSTREAM_RESULTS:
switch (event) {
case EVENT_DOWNSTREAM_RESPONSE:
return ProcessDownstreamResponse(event_args);
case EVENT_DOWNSTREAM_CLOSED:
return RaiseNoMatchErrorIfGotNoResults(event_args);
case EVENT_END_RECOGNITION:
return AbortSilently(event_args);
case EVENT_UPSTREAM_ERROR:
case EVENT_DOWNSTREAM_ERROR:
return AbortWithError(event_args);
case EVENT_START_RECOGNITION:
case EVENT_AUDIO_CHUNK:
case EVENT_AUDIO_CHUNKS_ENDED:
return NotFeasible(event_args);
}
break;
}
return NotFeasible(event_args);
}
// ----------- Contract for all the FSM evolution functions below -------------
// - They are guaranteed to be executed on the same thread (IO, except for
//   tests);
// - They are guaranteed not to be reentrant (with themselves or each other);
// - event_args members are guaranteed to be stable during the call.
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::ConnectBothStreams(const FSMEventArgs&) {
DCHECK(!upstream_fetcher_.get());
DCHECK(!downstream_fetcher_.get());
encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
config_.audio_sample_rate,
config_.audio_num_bits_per_sample));
DCHECK(encoder_.get());
const std::string request_key = GenerateRequestKey();
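// The key is sent as the "pair=" parameter on both requests so that the
// server can associate the upstream and downstream halves of this session.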
// Setup downstream fetcher.
std::vector<std::string> downstream_args;
downstream_args.push_back(
"key=" + net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
downstream_args.push_back("pair=" + request_key);
downstream_args.push_back("output=pb");
GURL downstream_url(std::string(kWebServiceBaseUrl) +
std::string(kDownstreamUrl) +
JoinString(downstream_args, '&'));
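// The resulting URL looks like (values illustrative):
// https://www.google.com/speech-api/full-duplex/v1/down?key=<api_key>&pair=<key>&output=pb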
downstream_fetcher_.reset(URLFetcher::Create(
kDownstreamUrlFetcherIdForTesting, downstream_url, URLFetcher::GET,
this));
downstream_fetcher_->SetRequestContext(url_context_.get());
downstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
net::LOAD_DO_NOT_SEND_COOKIES |
net::LOAD_DO_NOT_SEND_AUTH_DATA);
downstream_fetcher_->Start();
// Setup upstream fetcher.
// TODO(hans): Support for user-selected grammars.
std::vector<std::string> upstream_args;
upstream_args.push_back("key=" +
net::EscapeQueryParamValue(google_apis::GetAPIKey(), true));
upstream_args.push_back("pair=" + request_key);
upstream_args.push_back("output=pb");
upstream_args.push_back(
"lang=" + net::EscapeQueryParamValue(GetAcceptedLanguages(), true));
upstream_args.push_back(
config_.filter_profanities ? "pFilter=2" : "pFilter=0");
if (config_.max_hypotheses > 0U) {
int max_alternatives = std::min(kMaxMaxAlternatives,
config_.max_hypotheses);
upstream_args.push_back("maxAlternatives=" +
base::UintToString(max_alternatives));
}
upstream_args.push_back("client=chromium");
if (!config_.hardware_info.empty()) {
upstream_args.push_back(
"xhw=" + net::EscapeQueryParamValue(config_.hardware_info, true));
}
if (config_.continuous)
upstream_args.push_back("continuous");
if (config_.interim_results)
upstream_args.push_back("interim");
GURL upstream_url(std::string(kWebServiceBaseUrl) +
std::string(kUpstreamUrl) +
JoinString(upstream_args, '&'));
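// The resulting URL looks like (values illustrative; optional parameters
// depend on |config_|):
// https://www.google.com/speech-api/full-duplex/v1/up?key=<api_key>&pair=<key>&output=pb&lang=en-US&pFilter=0&client=chromium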
upstream_fetcher_.reset(URLFetcher::Create(
kUpstreamUrlFetcherIdForTesting, upstream_url, URLFetcher::POST, this));
upstream_fetcher_->SetChunkedUpload(encoder_->mime_type());
upstream_fetcher_->SetRequestContext(url_context_.get());
upstream_fetcher_->SetReferrer(config_.origin_url);
upstream_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
net::LOAD_DO_NOT_SEND_COOKIES |
net::LOAD_DO_NOT_SEND_AUTH_DATA);
upstream_fetcher_->Start();
previous_response_length_ = 0;
return STATE_BOTH_STREAMS_CONNECTED;
}
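// Encodes one captured audio chunk (nominally kAudioPacketIntervalMs worth
// of samples) and appends the result as the next piece of the chunked POST
// body; the upload is finalized in CloseUpstreamAndWaitForResults().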
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::TransmitAudioUpstream(
const FSMEventArgs& event_args) {
DCHECK(upstream_fetcher_.get());
DCHECK(event_args.audio_data.get());
const AudioChunk& audio = *(event_args.audio_data.get());
DCHECK_EQ(audio.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
encoder_->Encode(audio);
scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
upstream_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
return state_;
}
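// Parses one protocol chunk into a proto::SpeechRecognitionEvent. A status
// other than STATUS_SUCCESS aborts recognition with the corresponding error
// code; otherwise, final and provisional results are forwarded to the
// delegate.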
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::ProcessDownstreamResponse(
const FSMEventArgs& event_args) {
DCHECK(event_args.response.get());
proto::SpeechRecognitionEvent ws_event;
if (!ws_event.ParseFromString(std::string(event_args.response->begin(),
event_args.response->end())))
return AbortWithError(event_args);
// An empty (default) event is used to notify us that the upstream has
// been connected. Ignore.
if (!ws_event.result_size() && (!ws_event.has_status() ||
ws_event.status() == proto::SpeechRecognitionEvent::STATUS_SUCCESS)) {
DVLOG(1) << "Received empty response";
return state_;
}
if (ws_event.has_status()) {
switch (ws_event.status()) {
case proto::SpeechRecognitionEvent::STATUS_SUCCESS:
break;
case proto::SpeechRecognitionEvent::STATUS_NO_SPEECH:
return Abort(SPEECH_RECOGNITION_ERROR_NO_SPEECH);
case proto::SpeechRecognitionEvent::STATUS_ABORTED:
return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
case proto::SpeechRecognitionEvent::STATUS_AUDIO_CAPTURE:
return Abort(SPEECH_RECOGNITION_ERROR_AUDIO);
case proto::SpeechRecognitionEvent::STATUS_NETWORK:
return Abort(SPEECH_RECOGNITION_ERROR_NETWORK);
case proto::SpeechRecognitionEvent::STATUS_NOT_ALLOWED:
// TODO(hans): We need a better error code for this.
return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
case proto::SpeechRecognitionEvent::STATUS_SERVICE_NOT_ALLOWED:
// TODO(hans): We need a better error code for this.
return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
case proto::SpeechRecognitionEvent::STATUS_BAD_GRAMMAR:
return Abort(SPEECH_RECOGNITION_ERROR_BAD_GRAMMAR);
case proto::SpeechRecognitionEvent::STATUS_LANGUAGE_NOT_SUPPORTED:
// TODO(hans): We need a better error code for this.
return Abort(SPEECH_RECOGNITION_ERROR_ABORTED);
}
}
SpeechRecognitionResults results;
for (int i = 0; i < ws_event.result_size(); ++i) {
const proto::SpeechRecognitionResult& ws_result = ws_event.result(i);
results.push_back(SpeechRecognitionResult());
SpeechRecognitionResult& result = results.back();
result.is_provisional = !(ws_result.has_final() && ws_result.final());
if (!result.is_provisional)
got_last_definitive_result_ = true;
for (int j = 0; j < ws_result.alternative_size(); ++j) {
const proto::SpeechRecognitionAlternative& ws_alternative =
ws_result.alternative(j);
SpeechRecognitionHypothesis hypothesis;
if (ws_alternative.has_confidence())
hypothesis.confidence = ws_alternative.confidence();
else if (ws_result.has_stability())
hypothesis.confidence = ws_result.stability();
DCHECK(ws_alternative.has_transcript());
// TODO(hans): Perhaps the transcript should be required in the proto?
if (ws_alternative.has_transcript())
hypothesis.utterance = base::UTF8ToUTF16(ws_alternative.transcript());
result.hypotheses.push_back(hypothesis);
}
}
delegate()->OnSpeechRecognitionEngineResults(results);
return state_;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::RaiseNoMatchErrorIfGotNoResults(
const FSMEventArgs& event_args) {
if (!got_last_definitive_result_) {
// Provide an empty result to signal that recognition ended without errors,
// but also without any further results.
delegate()->OnSpeechRecognitionEngineResults(SpeechRecognitionResults());
}
return AbortSilently(event_args);
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::CloseUpstreamAndWaitForResults(
const FSMEventArgs&) {
DCHECK(upstream_fetcher_.get());
DCHECK(encoder_.get());
DVLOG(1) << "Closing upstream.";
// The encoder requires a non-empty final buffer, so we encode a packet of
// silence in case the encoder has no pending data.
std::vector<short> samples(
config_.audio_sample_rate * kAudioPacketIntervalMs / 1000);
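// E.g., at a 16000 Hz sample rate: 16000 * 100 / 1000 = 1600 zero samples.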
scoped_refptr<AudioChunk> dummy_chunk =
new AudioChunk(reinterpret_cast<uint8*>(&samples[0]),
samples.size() * sizeof(short),
encoder_->bits_per_sample() / 8);
encoder_->Encode(*dummy_chunk.get());
encoder_->Flush();
scoped_refptr<AudioChunk> encoded_dummy_data =
encoder_->GetEncodedDataAndClear();
DCHECK(!encoded_dummy_data->IsEmpty());
encoder_.reset();
upstream_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
got_last_definitive_result_ = false;
return STATE_WAITING_DOWNSTREAM_RESULTS;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::CloseDownstream(const FSMEventArgs&) {
DCHECK(!upstream_fetcher_.get());
DCHECK(downstream_fetcher_.get());
DVLOG(1) << "Closing downstream.";
downstream_fetcher_.reset();
return STATE_IDLE;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::AbortSilently(const FSMEventArgs&) {
return Abort(SPEECH_RECOGNITION_ERROR_NONE);
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::AbortWithError(const FSMEventArgs&) {
return Abort(SPEECH_RECOGNITION_ERROR_NETWORK);
}
GoogleStreamingRemoteEngine::FSMState GoogleStreamingRemoteEngine::Abort(
SpeechRecognitionErrorCode error_code) {
DVLOG(1) << "Aborting with error " << error_code;
if (error_code != SPEECH_RECOGNITION_ERROR_NONE) {
delegate()->OnSpeechRecognitionEngineError(
SpeechRecognitionError(error_code));
}
downstream_fetcher_.reset();
upstream_fetcher_.reset();
encoder_.reset();
return STATE_IDLE;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::DoNothing(const FSMEventArgs&) {
return state_;
}
GoogleStreamingRemoteEngine::FSMState
GoogleStreamingRemoteEngine::NotFeasible(const FSMEventArgs& event_args) {
NOTREACHED() << "Unfeasible event " << event_args.event
<< " in state " << state_;
return state_;
}
std::string GoogleStreamingRemoteEngine::GetAcceptedLanguages() const {
std::string langs = config_.language;
if (langs.empty() && url_context_.get()) {
// If no language is provided, use the first language from the accept
// language list. If that list is empty, default to "en-US".
// Example contents of this list: "es,en-GB;q=0.8", "".
net::URLRequestContext* request_context =
url_context_->GetURLRequestContext();
DCHECK(request_context);
// TODO(pauljensen): GoogleStreamingRemoteEngine should be constructed with
// a reference to the HttpUserAgentSettings rather than accessing the
// accept language through the URLRequestContext.
if (request_context->http_user_agent_settings()) {
std::string accepted_language_list =
request_context->http_user_agent_settings()->GetAcceptLanguage();
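// E.g., "es,en-GB;q=0.8" yields "es" after the truncation below.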
size_t separator = accepted_language_list.find_first_of(",;");
if (separator != std::string::npos)
langs = accepted_language_list.substr(0, separator);
}
}
if (langs.empty())
langs = "en-US";
return langs;
}
// TODO(primiano): Is there any utility in the codebase that already does this?
std::string GoogleStreamingRemoteEngine::GenerateRequestKey() const {
const int64 kKeepLowBytes = 0x00000000FFFFFFFFLL;
const int64 kKeepHighBytes = 0xFFFFFFFF00000000LL;
// Keep only the least significant bits of the timestamp and combine them
// with random upper bits, in order to reduce the probability of collisions.
int64 key = (base::Time::Now().ToInternalValue() & kKeepLowBytes) |
(base::RandUint64() & kKeepHighBytes);
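// Resulting layout: bits 63..32 are random, bits 31..0 come from the
// timestamp.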
return base::HexEncode(reinterpret_cast<void*>(&key), sizeof(key));
}
GoogleStreamingRemoteEngine::FSMEventArgs::FSMEventArgs(FSMEvent event_value)
: event(event_value) {
}
GoogleStreamingRemoteEngine::FSMEventArgs::~FSMEventArgs() {
}
} // namespace content