| /* |
| * Copyright (C) 2018 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #ifndef LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ |
| #define LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ |
| |
| #include <string> |
| #include <unordered_map> |
| #include <unordered_set> |
| #include <vector> |
| |
| #include "annotator/feature-processor.h" |
| #include "annotator/model_generated.h" |
| #include "annotator/types.h" |
| #include "utils/utf8/unicodetext.h" |
| |
| namespace libtextclassifier3 { |
| |
| namespace internal { |
// Units of time that a parsed piece of a duration expression can carry.
//
// NOTE: MONTH and YEAR are intentionally absent. Unlike
// week/day/hour/minute/second they do not correspond to a fixed number of
// milliseconds, so supporting them would require a different parsing format.
// Daylight savings time is ignored and a day is always assumed to be 24 hours.
enum class DurationUnit {
  // No (or not yet determined) unit.
  UNKNOWN = -1,

  WEEK = 0,
  DAY = 1,
  HOUR = 2,
  MINUTE = 3,
  SECOND = 4,
};
| |
| // Prepares the mapping between token values and duration unit types. |
| std::unordered_map<std::string, internal::DurationUnit> |
| BuildTokenToDurationUnitMapping(const DurationAnnotatorOptions* options); |
| |
| // Creates a set of strings from a flatbuffer string vector. |
| std::unordered_set<std::string> BuildStringSet( |
| const flatbuffers::Vector<flatbuffers::Offset<flatbuffers::String>>*); |
| |
| } // namespace internal |
| |
| // Annotator of duration expressions like "3 minutes 30 seconds". |
| class DurationAnnotator { |
| public: |
| explicit DurationAnnotator(const DurationAnnotatorOptions* options, |
| const FeatureProcessor* feature_processor) |
| : options_(options), |
| feature_processor_(feature_processor), |
| token_value_to_duration_unit_( |
| internal::BuildTokenToDurationUnitMapping(options)), |
| filler_expressions_( |
| internal::BuildStringSet(options->filler_expressions())), |
| half_expressions_( |
| internal::BuildStringSet(options->half_expressions())) {} |
| |
| // Classifies given text, and if it is a duration, it passes the result in |
| // 'classification_result' and returns true, otherwise returns false. |
| bool ClassifyText(const UnicodeText& context, CodepointSpan selection_indices, |
| AnnotationUsecase annotation_usecase, |
| ClassificationResult* classification_result) const; |
| |
| // Finds all duration instances in the input text. |
| bool FindAll(const UnicodeText& context, const std::vector<Token>& tokens, |
| AnnotationUsecase annotation_usecase, |
| std::vector<AnnotatedSpan>* results) const; |
| |
| private: |
| // Represents a component of duration parsed from text (e.g. "3 hours" from |
| // the expression "3 hours and 20 minutes"). |
| struct ParsedDurationAtom { |
| // Unit of the duration. |
| internal::DurationUnit unit = internal::DurationUnit::UNKNOWN; |
| |
| // Quantity of the duration unit. |
| int value = 0; |
| |
| // True, if half an unit was specified (either in addition, or exclusively). |
| // E.g. "hour and a half". |
| // NOTE: Quarter, three-quarters etc. is not supported. |
| bool plus_half = false; |
| |
| static ParsedDurationAtom Half() { |
| ParsedDurationAtom result; |
| result.plus_half = true; |
| return result; |
| } |
| }; |
| |
| // Starts consuming tokens and returns the index past the last consumed token. |
| int FindDurationStartingAt(const UnicodeText& context, |
| const std::vector<Token>& tokens, |
| int start_token_index, |
| AnnotatedSpan* result) const; |
| |
| bool ParseQuantityToken(const Token& token, ParsedDurationAtom* value) const; |
| bool ParseDurationUnitToken(const Token& token, |
| internal::DurationUnit* duration_unit) const; |
| bool ParseFillerToken(const Token& token) const; |
| |
| int64 ParsedDurationAtomsToMillis( |
| const std::vector<ParsedDurationAtom>& atoms) const; |
| |
| const DurationAnnotatorOptions* options_; |
| const FeatureProcessor* feature_processor_; |
| const std::unordered_map<std::string, internal::DurationUnit> |
| token_value_to_duration_unit_; |
| const std::unordered_set<std::string> filler_expressions_; |
| const std::unordered_set<std::string> half_expressions_; |
| }; |
| |
| } // namespace libtextclassifier3 |
| |
| #endif // LIBTEXTCLASSIFIER_ANNOTATOR_DURATION_DURATION_H_ |