annotator/duration/duration.h - platform/external/libtextclassifier - Git at Google

 /*
  * Copyright (C) 2018 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
 #define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_

 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>

 #include "annotator/feature-processor.h"
 #include "annotator/model_generated.h"
 #include "annotator/types.h"
 #include "utils/utf8/unicodetext.h"

 namespace libtextclassifier3 {

 namespace internal {
 enum class DurationUnit {
   UNKNOWN = -1,
   WEEK = 0,
   DAY = 1,
   HOUR = 2,
   MINUTE = 3,
   SECOND = 4

   // NOTE: If we want to add MONTH and YEAR we'll have to think of different
   // parsing format, because MONTH and YEAR don't have a fixed number of
   // milliseconds, unlike week/day/hour/minute/second. We ignore the daylight
   // savings time and assume the day is always 24 hours.
 };

 // Prepares the mapping between token values and duration unit types.
 std::unordered_map<std::string, internal::DurationUnit>
 BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options);

 // Creates a set of strings from a flatbuffer string vector.
 std::unordered_set<std::string> BuildStringSet(
     const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*);

 }  // namespace internal

 // Annotator of duration expressions like "3 minutes 30 seconds".
 class DurationAnnotator {
  public:
   explicit DurationAnnotator(const DurationAnnotatorOptions* options,
                              const FeatureProcessor* feature_processor)
       : options_(options),
         feature_processor_(feature_processor),
         token_value_to_duration_unit_(
             internal::BuildTokenToDurationUnitMapping(options)),
         filler_expressions_(
             internal::BuildStringSet(options->filler_expressions())),
         half_expressions_(
             internal::BuildStringSet(options->half_expressions())) {}

   // Classifies given text, and if it is a duration, it passes the result in
   // 'classification_result' and returns true, otherwise returns false.
   bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
                     AnnotationUsecase annotation_usecase,
                     ClassificationResult* classification_result) const;

   // Finds all duration instances in the input text.
   bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens,
                AnnotationUsecase annotation_usecase,
                std::vector<AnnotatedSpan>* results) const;

  private:
   // Represents a component of duration parsed from text (e.g. "3 hours" from
   // the expression "3 hours and 20 minutes").
   struct ParsedDurationAtom {
     // Unit of the duration.
     internal::DurationUnit unit = internal::DurationUnit::UNKNOWN;

     // Quantity of the duration unit.
     int value = 0;

     // True, if half an unit was specified (either in addition, or exclusively).
     // E.g. "hour and a half".
     // NOTE: Quarter, three-quarters etc. is not supported.
     bool plus_half = false;

     static ParsedDurationAtom Half() {
       ParsedDurationAtom result;
       result.plus_half = true;
       return result;
     }
   };

   // Starts consuming tokens and returns the index past the last consumed token.
   int FindDurationStartingAt(const UnicodeText& context,
                              const std::vector<Token>& tokens,
                              int start_token_index,
                              AnnotatedSpan* result) const;

   bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const;
   bool ParseDurationUnitToken(const Token& token,
                               internal::DurationUnit* duration_unit) const;
   bool ParseFillerToken(const Token& token) const;

   int64 ParsedDurationAtomsToMillis(
       const std::vector<ParsedDurationAtom>& atoms) const;

   const DurationAnnotatorOptions* options_;
   const FeatureProcessor* feature_processor_;
   const std::unordered_map<std::string, internal::DurationUnit>
       token_value_to_duration_unit_;
   const std::unordered_set<std::string> filler_expressions_;
   const std::unordered_set<std::string> half_expressions_;
 };

 }  // namespace libtextclassifier3

 #endif  // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
	/*
	* Copyright (C) 2018 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_
	#define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_

	#include <string>
	#include <unordered_map>
	#include <unordered_set>
	#include <vector>

	#include "annotator/feature-processor.h"
	#include "annotator/model_generated.h"
	#include "annotator/types.h"
	#include "utils/utf8/unicodetext.h"

	namespace libtextclassifier3 {

	namespace internal {
	enum class DurationUnit {
	UNKNOWN = -1,
	WEEK = 0,
	DAY = 1,
	HOUR = 2,
	MINUTE = 3,
	SECOND = 4

	// NOTE: If we want to add MONTH and YEAR we'll have to think of different
	// parsing format, because MONTH and YEAR don't have a fixed number of
	// milliseconds, unlike week/day/hour/minute/second. We ignore the daylight
	// savings time and assume the day is always 24 hours.
	};

	// Prepares the mapping between token values and duration unit types.
	std::unordered_map<std::string, internal::DurationUnit>
	BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options);

	// Creates a set of strings from a flatbuffer string vector.
	std::unordered_set<std::string> BuildStringSet(
	const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*);

	} // namespace internal

	// Annotator of duration expressions like "3 minutes 30 seconds".
	class DurationAnnotator {
	public:
	explicit DurationAnnotator(const DurationAnnotatorOptions* options,
	const FeatureProcessor* feature_processor)
	: options_(options),
	feature_processor_(feature_processor),
	token_value_to_duration_unit_(
	internal::BuildTokenToDurationUnitMapping(options)),
	filler_expressions_(
	internal::BuildStringSet(options->filler_expressions())),
	half_expressions_(
	internal::BuildStringSet(options->half_expressions())) {}

	// Classifies given text, and if it is a duration, it passes the result in
	// 'classification_result' and returns true, otherwise returns false.
	bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices,
	AnnotationUsecase annotation_usecase,
	ClassificationResult* classification_result) const;

	// Finds all duration instances in the input text.
	bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens,
	AnnotationUsecase annotation_usecase,
	std::vector<AnnotatedSpan>* results) const;

	private:
	// Represents a component of duration parsed from text (e.g. "3 hours" from
	// the expression "3 hours and 20 minutes").
	struct ParsedDurationAtom {
	// Unit of the duration.
	internal::DurationUnit unit = internal::DurationUnit::UNKNOWN;

	// Quantity of the duration unit.
	int value = 0;

	// True, if half an unit was specified (either in addition, or exclusively).
	// E.g. "hour and a half".
	// NOTE: Quarter, three-quarters etc. is not supported.
	bool plus_half = false;

	static ParsedDurationAtom Half() {
	ParsedDurationAtom result;
	result.plus_half = true;
	return result;
	}
	};

	// Starts consuming tokens and returns the index past the last consumed token.
	int FindDurationStartingAt(const UnicodeText& context,
	const std::vector<Token>& tokens,
	int start_token_index,
	AnnotatedSpan* result) const;

	bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const;
	bool ParseDurationUnitToken(const Token& token,
	internal::DurationUnit* duration_unit) const;
	bool ParseFillerToken(const Token& token) const;

	int64 ParsedDurationAtomsToMillis(
	const std::vector<ParsedDurationAtom>& atoms) const;

	const DurationAnnotatorOptions* options_;
	const FeatureProcessor* feature_processor_;
	const std::unordered_map<std::string, internal::DurationUnit>
	token_value_to_duration_unit_;
	const std::unordered_set<std::string> filler_expressions_;
	const std::unordered_set<std::string> half_expressions_;
	};

	} // namespace libtextclassifier3

	#endif // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_