/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_LANG_ID_LANG_ID_H_
#define LIBTEXTCLASSIFIER_LANG_ID_LANG_ID_H_

// Clients who want to perform language identification should use this header.
//
// Note for lang id implementors: keep this header as light as possible.  E.g.,
// any macro defined here (or in a transitively #included file) is a potential
// name conflict with our clients.

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "util/base/macros.h"

namespace libtextclassifier {
namespace nlp_core {
namespace lang_id {

// Forward-declaration of the class that performs all underlying work.  A
// forward declaration is sufficient in this header because LangId only refers
// to LangIdImpl through a std::unique_ptr member.
class LangIdImpl;

// Class for detecting the language of a document.
//
// All underlying work is performed by a LangIdImpl object held through the
// private pimpl_ member, so this header exposes no implementation details.
//
// NOTE: this class is go/thread-unsafe.
class LangId {
 public:
  // Constructs a LangId object, loading an EmbeddingNetworkProto model from the
  // indicated file.
  //
  // Note: we don't crash if we detect a problem at construction time (e.g.,
  // file doesn't exist, or its content is corrupted).  Instead, we mark the
  // newly-constructed object as invalid (see is_valid()); clients can invoke
  // FindLanguage() on an invalid object: nothing crashes, but accuracy will be
  // bad.
  explicit LangId(const std::string &filename);

  // Same as above, but reads the model from the file descriptor fd.
  // NOTE(review): this header does not say whether LangId takes ownership of
  // (i.e., closes) fd -- confirm against the implementation.
  explicit LangId(int fd);

  // Declared here and defined out of line: LangIdImpl is an incomplete type in
  // this header, and std::unique_ptr<LangIdImpl> needs the complete type at
  // the point where its deleter is instantiated.
  virtual ~LangId();

  // Sets probability threshold for predictions.  If our likeliest prediction is
  // below this threshold, we report the default language (see
  // SetDefaultLanguage()).  Otherwise, we report the likeliest language.
  //
  // By default (if this method is not called) we use the probability threshold
  // stored in the model, as the task parameter "reliability_thresh".  If that
  // task parameter is not specified, we use 0.5.  A client can use this method
  // to get a different precision / recall trade-off.  The higher the threshold,
  // the higher the precision and lower the recall rate.
  void SetProbabilityThreshold(float threshold);

  // Sets default language to report if errors prevent running the real
  // inference code or if prediction confidence is too small (see
  // SetProbabilityThreshold()).
  void SetDefaultLanguage(const std::string &lang);

  // Returns language code for the most likely language that text is written in.
  //
  // Note: if this LangId object is not valid (see is_valid()), this method
  // returns the default language specified via SetDefaultLanguage() or (if
  // that method was never invoked) the empty std::string.
  std::string FindLanguage(const std::string &text) const;

  // Returns a vector of language codes along with the probability for each
  // language: in each pair, .first is the language code and .second is its
  // probability.  The result contains at least one element.  The sum of
  // probabilities may be less than 1.0.
  std::vector<std::pair<std::string, float>> FindLanguages(
      const std::string &text) const;

  // Returns true if this object has been correctly initialized and is ready to
  // perform predictions.  For more info, see doc for the LangId constructors
  // above.
  bool is_valid() const;

 private:
  // Returns a vector of probabilities of languages of the text.
  // NOTE(review): presumably one score per language known to the model, in the
  // model's own language order -- confirm against the implementation.
  std::vector<float> ScoreLanguages(const std::string &text) const;

  // Pimpl ("pointer to implementation") pattern, to hide all internals from our
  // clients.
  std::unique_ptr<LangIdImpl> pimpl_;

  // Macro comes from util/base/macros.h; presumably it deletes the copy
  // constructor and copy-assignment operator -- confirm there.
  TC_DISALLOW_COPY_AND_ASSIGN(LangId);
};

}  // namespace lang_id
}  // namespace nlp_core
}  // namespace libtextclassifier

#endif  // LIBTEXTCLASSIFIER_LANG_ID_LANG_ID_H_
