blob: ab703d7ac04e7941d021d00e0ccc12c654ade0a0 [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <queue>
#include "base/memory/scoped_ptr.h"
#include "base/message_loop/message_loop.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/utf_string_conversions.h"
#include "base/sys_byteorder.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/browser/speech/google_streaming_remote_engine.h"
#include "content/browser/speech/proto/google_streaming_api.pb.h"
#include "content/public/common/speech_recognition_error.h"
#include "content/public/common/speech_recognition_result.h"
#include "net/url_request/test_url_fetcher_factory.h"
#include "net/url_request/url_request_context_getter.h"
#include "net/url_request/url_request_status.h"
#include "testing/gtest/include/gtest/gtest.h"
using base::HostToNet32;
using base::checked_cast;
using net::URLRequestStatus;
using net::TestURLFetcher;
using net::TestURLFetcherFactory;
namespace content {
// Note: the terms upstream and downstream are from the point-of-view of the
// client (engine_under_test_).
// Test fixture for GoogleStreamingRemoteEngine. The fixture itself acts as
// the engine's delegate: recognition results reported by the engine are
// queued into |results_| and the last reported error code is stored in
// |error_|, so individual tests can assert on what the engine emitted.
class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate,
public testing::Test {
public:
GoogleStreamingRemoteEngineTest()
: last_number_of_upstream_chunks_seen_(0U),
error_(SPEECH_RECOGNITION_ERROR_NONE) { }
// Creates a speech recognition request and invokes its URL fetcher delegate
// with the given test data.
void CreateAndTestRequest(bool success, const std::string& http_response);
// SpeechRecognitionRequestDelegate methods.
virtual void OnSpeechRecognitionEngineResults(
const SpeechRecognitionResults& results) OVERRIDE {
results_.push(results);
}
virtual void OnSpeechRecognitionEngineError(
const SpeechRecognitionError& error) OVERRIDE {
error_ = error.code;
}
// testing::Test methods.
virtual void SetUp() OVERRIDE;
virtual void TearDown() OVERRIDE;
protected:
// Ways in which the mock downstream (response) channel can be closed.
enum DownstreamError {
DOWNSTREAM_ERROR_NONE,
DOWNSTREAM_ERROR_HTTP500,
DOWNSTREAM_ERROR_NETWORK,
DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH
};
// Deep-compares two results lists: provisional flags, hypotheses count,
// utterances and confidences must all match.
static bool ResultsAreEqual(const SpeechRecognitionResults& a,
const SpeechRecognitionResults& b);
// Serializes |msg| prefixed by the 4-byte length header used on the wire.
static std::string SerializeProtobufResponse(
const proto::SpeechRecognitionEvent& msg);
// Accessors for the mock URL fetchers the engine registers on start.
TestURLFetcher* GetUpstreamFetcher();
TestURLFetcher* GetDownstreamFetcher();
void StartMockRecognition();
void EndMockRecognition();
void InjectDummyAudioChunk();
// Returns the number of upstream chunks uploaded since the previous call.
size_t UpstreamChunksUploadedFromLastCall();
void ProvideMockProtoResultDownstream(
const proto::SpeechRecognitionEvent& result);
void ProvideMockResultDownstream(const SpeechRecognitionResult& result);
void ExpectResultsReceived(const SpeechRecognitionResults& result);
void CloseMockDownstream(DownstreamError error);
scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_;
TestURLFetcherFactory url_fetcher_factory_;
// Upload-chunk count observed by the last UpstreamChunksUploadedFromLastCall.
size_t last_number_of_upstream_chunks_seen_;
base::MessageLoop message_loop_;
// Accumulated body of the mock downstream HTTP response.
std::string response_buffer_;
// Last error reported through OnSpeechRecognitionEngineError().
SpeechRecognitionErrorCode error_;
// Results reported through OnSpeechRecognitionEngineResults(), FIFO order.
std::queue<SpeechRecognitionResults> results_;
};
// Checks the simplest happy path: a few audio chunks, one definitive result
// with two hypotheses, then a clean downstream closure.
TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  // Every injected audio chunk must be followed by exactly one chunked
  // upload towards the server.
  for (int chunk_index = 0; chunk_index < 3; ++chunk_index) {
    InjectDummyAudioChunk();
    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  }

  // Signalling the end of audio must upload one last (empty) chunk.
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Stream a protobuf message from the server carrying a single definitive
  // result with two hypotheses.
  SpeechRecognitionResults expected_results;
  expected_results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& definitive_result = expected_results.back();
  definitive_result.is_provisional = false;
  definitive_result.hypotheses.push_back(
      SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 1"), 0.1F));
  definitive_result.hypotheses.push_back(
      SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis 2"), 0.2F));
  ProvideMockResultDownstream(definitive_result);
  ExpectResultsReceived(expected_results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // A clean downstream closure must end the recognition with no errors and
  // no leftover results.
  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}
// Interleaves audio chunk uploads with streamed results, alternating
// provisional and definitive results, and ends with a final definitive one.
TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  for (int round = 0; round < 4; ++round) {
    InjectDummyAudioChunk();
    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());

    // Even rounds stream a provisional result, odd rounds a definitive one.
    SpeechRecognitionResults expected_results;
    expected_results.push_back(SpeechRecognitionResult());
    SpeechRecognitionResult& streamed_result = expected_results.back();
    streamed_result.is_provisional = (round % 2 == 0);
    float confidence = streamed_result.is_provisional ? 0.0F : (round * 0.1F);
    streamed_result.hypotheses.push_back(SpeechRecognitionHypothesis(
        base::UTF8ToUTF16("hypothesis"), confidence));

    ProvideMockResultDownstream(streamed_result);
    ExpectResultsReceived(expected_results);
    ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
  }

  // Signalling the end of audio must upload one last (empty) chunk.
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Stream the final, definitive result.
  SpeechRecognitionResults final_results;
  final_results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& final_result = final_results.back();
  final_result.is_provisional = false;
  final_result.hypotheses.push_back(
      SpeechRecognitionHypothesis(base::UTF8ToUTF16("The final result"), 1.0F));
  ProvideMockResultDownstream(final_result);
  ExpectResultsReceived(final_results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // A clean downstream closure must end the recognition with no errors and
  // no leftover results.
  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}
// If the downstream closes silently after AudioChunksEnded() (i.e. no
// definitive result follows the end of audio), the engine must still emit an
// empty results bundle to signal that recognition ended without errors.
TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamFetcher());
ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
// Simulate one pushed audio chunk.
InjectDummyAudioChunk();
ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
// Simulate the corresponding definitive result.
SpeechRecognitionResults results;
results.push_back(SpeechRecognitionResult());
SpeechRecognitionResult& result = results.back();
result.hypotheses.push_back(
SpeechRecognitionHypothesis(base::UTF8ToUTF16("hypothesis"), 1.0F));
ProvideMockResultDownstream(result);
ExpectResultsReceived(results);
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Simulate a silent downstream closure after |AudioChunksEnded|.
engine_under_test_->AudioChunksEnded();
ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
// Expect an empty result, aimed at notifying recognition ended with no
// actual results nor errors.
SpeechRecognitionResults empty_results;
ExpectResultsReceived(empty_results);
// Ensure everything is closed cleanly after the downstream is closed.
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
ASSERT_EQ(0U, results_.size());
}
// If the server only ever delivered provisional results and then closes the
// downstream without a match, the engine must emit an empty results bundle
// rather than an error.
TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamFetcher());
ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
for (int i = 0; i < 3; ++i)
InjectDummyAudioChunk();
engine_under_test_->AudioChunksEnded();
// 3 audio chunks plus the final empty chunk uploaded by AudioChunksEnded().
ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall());
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Simulate only a provisional result.
SpeechRecognitionResults results;
results.push_back(SpeechRecognitionResult());
SpeechRecognitionResult& result = results.back();
result.is_provisional = true;
result.hypotheses.push_back(
SpeechRecognitionHypothesis(base::UTF8ToUTF16("The final result"), 0.0F));
ProvideMockResultDownstream(result);
ExpectResultsReceived(results);
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH);
// Expect an empty result.
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
SpeechRecognitionResults empty_result;
ExpectResultsReceived(empty_result);
}
// An HTTP 500 on the downstream must surface as a network error with no
// results delivered.
TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamFetcher());
ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
InjectDummyAudioChunk();
ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
// Close the downstream with a HTTP 500 error.
CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500);
// Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
ASSERT_EQ(0U, results_.size());
}
// A transport-level failure on the downstream must surface as a network
// error with no results delivered.
TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamFetcher());
ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
InjectDummyAudioChunk();
ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
// Close the downstream fetcher simulating a network failure.
CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK);
// Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
ASSERT_EQ(0U, results_.size());
}
// A streamed result carrying a stability value (and no per-alternative
// confidence) must be reported as a provisional result whose hypothesis
// confidence equals the stability.
TEST_F(GoogleStreamingRemoteEngineTest, Stability) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamFetcher());
ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());
// Upload a dummy audio chunk.
InjectDummyAudioChunk();
ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
engine_under_test_->AudioChunksEnded();
// Simulate a protobuf message with an intermediate result without confidence,
// but with stability.
proto::SpeechRecognitionEvent proto_event;
proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
proto_result->set_stability(0.5);
proto::SpeechRecognitionAlternative *proto_alternative =
proto_result->add_alternative();
proto_alternative->set_transcript("foo");
ProvideMockProtoResultDownstream(proto_event);
// Set up expectations: stability (0.5) is surfaced as the hypothesis
// confidence of a provisional result.
SpeechRecognitionResults results;
results.push_back(SpeechRecognitionResult());
SpeechRecognitionResult& result = results.back();
result.is_provisional = true;
result.hypotheses.push_back(
SpeechRecognitionHypothesis(base::UTF8ToUTF16("foo"), 0.5));
// Check that the protobuf generated the expected result.
ExpectResultsReceived(results);
// Since it was a provisional result, recognition is still pending.
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Shut down.
CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
// Since there was no final result, we get an empty "no match" result.
SpeechRecognitionResults empty_result;
ExpectResultsReceived(empty_result);
ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
ASSERT_EQ(0U, results_.size());
}
// Builds a fresh engine for each test and registers the fixture as its
// delegate. A NULL request context is enough because the TestURLFetcher
// factory intercepts all network traffic.
void GoogleStreamingRemoteEngineTest::SetUp() {
engine_under_test_.reset(
new GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/));
engine_under_test_->set_delegate(this);
}
// Destroys the engine after each test, releasing its URL fetchers.
void GoogleStreamingRemoteEngineTest::TearDown() {
engine_under_test_.reset();
}
// Returns the mock fetcher for the upstream (audio upload) channel, or NULL
// if the engine has not created it (yet).
TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() {
  const int fetcher_id =
      GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTesting;
  return url_fetcher_factory_.GetFetcherByID(fetcher_id);
}
// Returns the mock fetcher for the downstream (results) channel, or NULL if
// the engine has not created it (yet).
TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() {
  const int fetcher_id =
      GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTesting;
  return url_fetcher_factory_.GetFetcherByID(fetcher_id);
}
// Starts recognition on the engine, ensuring that both stream fetchers are
// created.
void GoogleStreamingRemoteEngineTest::StartMockRecognition() {
DCHECK(engine_under_test_.get());
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
engine_under_test_->StartRecognition();
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
ASSERT_TRUE(upstream_fetcher);
upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL());
TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
ASSERT_TRUE(downstream_fetcher);
downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL());
}
// Ends recognition on the engine and checks no recognition is left pending.
void GoogleStreamingRemoteEngineTest::EndMockRecognition() {
DCHECK(engine_under_test_.get());
engine_under_test_->EndRecognition();
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
// TODO(primiano): In order to be very pedantic we should check that both the
// upstream and downstream URL fetchers have been disposed at this time.
// Unfortunately it seems that there is no direct way to detect (in tests)
// if a url_fetcher has been freed or not, since they are not automatically
// de-registered from the TestURLFetcherFactory on destruction.
}
void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() {
unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'};
scoped_refptr<AudioChunk> dummy_audio_chunk(
new AudioChunk(&dummy_audio_buffer_data[0],
sizeof(dummy_audio_buffer_data),
2 /* bytes per sample */));
DCHECK(engine_under_test_.get());
engine_under_test_->TakeAudioChunk(*dummy_audio_chunk.get());
}
size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() {
TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
DCHECK(upstream_fetcher);
const size_t number_of_chunks = upstream_fetcher->upload_chunks().size();
DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_);
const size_t new_chunks = number_of_chunks -
last_number_of_upstream_chunks_seen_;
last_number_of_upstream_chunks_seen_ = number_of_chunks;
return new_chunks;
}
// Appends |result| (serialized with its length prefix) to the accumulated
// downstream body and notifies the engine of the download progress, as a
// real streamed HTTP response would.
void GoogleStreamingRemoteEngineTest::ProvideMockProtoResultDownstream(
    const proto::SpeechRecognitionEvent& result) {
  TestURLFetcher* fetcher = GetDownstreamFetcher();
  ASSERT_TRUE(fetcher);
  fetcher->set_status(URLRequestStatus(/* default=SUCCESS */));
  fetcher->set_response_code(200);
  // The downstream is one long chunked response: new messages are appended
  // to everything streamed so far.
  response_buffer_ += SerializeProtobufResponse(result);
  fetcher->SetResponseString(response_buffer_);
  fetcher->delegate()->OnURLFetchDownloadProgress(
      fetcher,
      response_buffer_.size(),
      -1 /* total response length not used */);
}
void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream(
const SpeechRecognitionResult& result) {
proto::SpeechRecognitionEvent proto_event;
proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
proto_result->set_final(!result.is_provisional);
for (size_t i = 0; i < result.hypotheses.size(); ++i) {
proto::SpeechRecognitionAlternative* proto_alternative =
proto_result->add_alternative();
const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i];
proto_alternative->set_confidence(hypothesis.confidence);
proto_alternative->set_transcript(base::UTF16ToUTF8(hypothesis.utterance));
}
ProvideMockProtoResultDownstream(proto_event);
}
// Closes the mock downstream channel, optionally simulating one of the
// failure modes enumerated by DownstreamError.
void GoogleStreamingRemoteEngineTest::CloseMockDownstream(
    DownstreamError error) {
  TestURLFetcher* fetcher = GetDownstreamFetcher();
  ASSERT_TRUE(fetcher);

  URLRequestStatus::Status fetcher_status = URLRequestStatus::SUCCESS;
  if (error == DOWNSTREAM_ERROR_NETWORK)
    fetcher_status = URLRequestStatus::FAILED;
  int response_code = 200;
  if (error == DOWNSTREAM_ERROR_HTTP500)
    response_code = 500;
  fetcher->set_status(URLRequestStatus(fetcher_status, 0));
  fetcher->set_response_code(response_code);

  if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) {
    // An empty protobuf event is how the webservice signals "no match".
    proto::SpeechRecognitionEvent empty_event;
    response_buffer_ += SerializeProtobufResponse(empty_event);
  }
  fetcher->SetResponseString(response_buffer_);
  fetcher->delegate()->OnURLFetchComplete(fetcher);
}
void GoogleStreamingRemoteEngineTest::ExpectResultsReceived(
const SpeechRecognitionResults& results) {
ASSERT_GE(1U, results_.size());
ASSERT_TRUE(ResultsAreEqual(results, results_.front()));
results_.pop();
}
bool GoogleStreamingRemoteEngineTest::ResultsAreEqual(
const SpeechRecognitionResults& a, const SpeechRecognitionResults& b) {
if (a.size() != b.size())
return false;
SpeechRecognitionResults::const_iterator it_a = a.begin();
SpeechRecognitionResults::const_iterator it_b = b.begin();
for (; it_a != a.end() && it_b != b.end(); ++it_a, ++it_b) {
if (it_a->is_provisional != it_b->is_provisional ||
it_a->hypotheses.size() != it_b->hypotheses.size()) {
return false;
}
for (size_t i = 0; i < it_a->hypotheses.size(); ++i) {
const SpeechRecognitionHypothesis& hyp_a = it_a->hypotheses[i];
const SpeechRecognitionHypothesis& hyp_b = it_b->hypotheses[i];
if (hyp_a.utterance != hyp_b.utterance ||
hyp_a.confidence != hyp_b.confidence) {
return false;
}
}
}
return true;
}
// Serializes |msg| and frames it for the wire: the google streaming
// recognition webservice protocol prefixes every protobuf message with its
// length as a 4-byte big-endian (network order) integer.
std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse(
    const proto::SpeechRecognitionEvent& msg) {
  std::string serialized;
  msg.SerializeToString(&serialized);
  uint32 length_prefix = HostToNet32(checked_cast<uint32>(serialized.size()));
  serialized.insert(
      0, reinterpret_cast<char*>(&length_prefix), sizeof(length_prefix));
  return serialized;
}
} // namespace content