smartselect/cached-features.h - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef LIBTEXTCLASSIFIER_SMARTSELECT_CACHED_FEATURES_H_
 #define LIBTEXTCLASSIFIER_SMARTSELECT_CACHED_FEATURES_H_

 #include <memory>
 #include <vector>

 #include "base.h"
 #include "common/vector-span.h"
 #include "smartselect/types.h"

 namespace libtextclassifier {

 // Holds state for extracting features across multiple calls and reusing them.
 // Assumes that features for each Token are independent.
 class CachedFeatures {
  public:
   // Extracts the features for the given sequence of tokens.
   //  - context_size: Specifies how many tokens to the left, and how many
   //                   tokens to the right spans the context.
   //  - sparse_features, dense_features: Extracted features for each token.
   //  - feature_vector_fn: Writes features for given Token to the specified
   //                       storage.
   //                       NOTE: The function can assume that the underlying
   //                       storage is initialized to all zeros.
   //  - feature_vector_size: Size of a feature vector for one Token.
   CachedFeatures(VectorSpan<Token> tokens, int context_size,
                  const std::vector<std::vector<int>>& sparse_features,
                  const std::vector<std::vector<float>>& dense_features,
                  const std::function<bool(const std::vector<int>&,
                                           const std::vector<float>&, float*)>&
                      feature_vector_fn,
                  int feature_vector_size)
       : tokens_(tokens),
         context_size_(context_size),
         feature_vector_size_(feature_vector_size),
         remap_v0_feature_vector_(false),
         remap_v0_chargram_embedding_size_(-1) {
     Extract(sparse_features, dense_features, feature_vector_fn);
   }

   // Gets a VectorSpan with the features for given click position.
   bool Get(int click_pos, VectorSpan<float>* features,
            VectorSpan<Token>* output_tokens);

   // Turns on a compatibility mode, which re-maps the extracted features to the
   // v0 feature format (where the dense features were at the end).
   // WARNING: Internally v0_feature_storage_ is used as a backing buffer for
   // VectorSpan<float>, so the output of Extract is valid only until the next
   // call or destruction of the current CachedFeatures object.
   // TODO(zilka): Remove when we'll have retrained models.
   void SetV0FeatureMode(int chargram_embedding_size) {
     remap_v0_feature_vector_ = true;
     remap_v0_chargram_embedding_size_ = chargram_embedding_size;
     v0_feature_storage_.resize(feature_vector_size_ * (context_size_ * 2 + 1));
   }

  protected:
   // Extracts features for all tokens and stores them for later retrieval.
   void Extract(const std::vector<std::vector<int>>& sparse_features,
                const std::vector<std::vector<float>>& dense_features,
                const std::function<bool(const std::vector<int>&,
                                         const std::vector<float>&, float*)>&
                    feature_vector_fn);

   // Remaps extracted features to V0 feature format. The mapping is using
   // the v0_feature_storage_ as the backing storage for the mapped features.
   // For each token the features consist of:
   //  - chargram embeddings
   //  - dense features
   // They are concatenated together as [chargram embeddings; dense features]
   // for each token independently.
   // The V0 features require that the chargram embeddings for tokens are
   // concatenated first together, and at the end, the dense features for the
   // tokens are concatenated to it.
   void RemapV0FeatureVector(VectorSpan<float>* features);

  private:
   const VectorSpan<Token> tokens_;
   const int context_size_;
   const int feature_vector_size_;
   bool remap_v0_feature_vector_;
   int remap_v0_chargram_embedding_size_;

   std::vector<float> features_;
   std::vector<float> v0_feature_storage_;
 };

 }  // namespace libtextclassifier

 #endif  // LIBTEXTCLASSIFIER_SMARTSELECT_CACHED_FEATURES_H_
	/*
	* Copyright (C) 2017 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef LIBTEXTCLASSIFIER_SMARTSELECT_CACHED_FEATURES_H_
	#define LIBTEXTCLASSIFIER_SMARTSELECT_CACHED_FEATURES_H_

	#include <memory>
	#include <vector>

	#include "base.h"
	#include "common/vector-span.h"
	#include "smartselect/types.h"

	namespace libtextclassifier {

	// Holds state for extracting features across multiple calls and reusing them.
	// Assumes that features for each Token are independent.
	class CachedFeatures {
	public:
	// Extracts the features for the given sequence of tokens.
	// - context_size: Specifies how many tokens to the left, and how many
	// tokens to the right spans the context.
	// - sparse_features, dense_features: Extracted features for each token.
	// - feature_vector_fn: Writes features for given Token to the specified
	// storage.
	// NOTE: The function can assume that the underlying
	// storage is initialized to all zeros.
	// - feature_vector_size: Size of a feature vector for one Token.
	CachedFeatures(VectorSpan<Token> tokens, int context_size,
	const std::vector<std::vector<int>>& sparse_features,
	const std::vector<std::vector<float>>& dense_features,
	const std::function<bool(const std::vector<int>&,
	const std::vector<float>&, float*)>&
	feature_vector_fn,
	int feature_vector_size)
	: tokens_(tokens),
	context_size_(context_size),
	feature_vector_size_(feature_vector_size),
	remap_v0_feature_vector_(false),
	remap_v0_chargram_embedding_size_(-1) {
	Extract(sparse_features, dense_features, feature_vector_fn);
	}

	// Gets a VectorSpan with the features for given click position.
	bool Get(int click_pos, VectorSpan<float>* features,
	VectorSpan<Token>* output_tokens);

	// Turns on a compatibility mode, which re-maps the extracted features to the
	// v0 feature format (where the dense features were at the end).
	// WARNING: Internally v0_feature_storage_ is used as a backing buffer for
	// VectorSpan<float>, so the output of Extract is valid only until the next
	// call or destruction of the current CachedFeatures object.
	// TODO(zilka): Remove when we'll have retrained models.
	void SetV0FeatureMode(int chargram_embedding_size) {
	remap_v0_feature_vector_ = true;
	remap_v0_chargram_embedding_size_ = chargram_embedding_size;
	v0_feature_storage_.resize(feature_vector_size_ * (context_size_ * 2 + 1));
	}

	protected:
	// Extracts features for all tokens and stores them for later retrieval.
	void Extract(const std::vector<std::vector<int>>& sparse_features,
	const std::vector<std::vector<float>>& dense_features,
	const std::function<bool(const std::vector<int>&,
	const std::vector<float>&, float*)>&
	feature_vector_fn);

	// Remaps extracted features to V0 feature format. The mapping is using
	// the v0_feature_storage_ as the backing storage for the mapped features.
	// For each token the features consist of:
	// - chargram embeddings
	// - dense features
	// They are concatenated together as [chargram embeddings; dense features]
	// for each token independently.
	// The V0 features require that the chargram embeddings for tokens are
	// concatenated first together, and at the end, the dense features for the
	// tokens are concatenated to it.
	void RemapV0FeatureVector(VectorSpan<float>* features);

	private:
	const VectorSpan<Token> tokens_;
	const int context_size_;
	const int feature_vector_size_;
	bool remap_v0_feature_vector_;
	int remap_v0_chargram_embedding_size_;

	std::vector<float> features_;
	std::vector<float> v0_feature_storage_;
	};

	} // namespace libtextclassifier

	#endif // LIBTEXTCLASSIFIER_SMARTSELECT_CACHED_FEATURES_H_