/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <executorch/examples/models/phi-3-mini/runner.h>

#include <cinttypes>
#include <ctime>
#include <iostream>

#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/platform/log.h>

namespace torch::executor {
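// Sampling and model constants: SAMPLER_TOP is the top-p value handed to the
// Sampler, ENDOFTEXT_TOKEN terminates generation, and VOCABULARY_SIZE matches
// the Phi-3-mini tokenizer.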
#define SAMPLER_TOP 0.9f
#define ENDOFTEXT_TOKEN 32000
#define VOCABULARY_SIZE 32064
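
// Loads the exported Phi-3-mini program as a Module, sets up the BPE
// tokenizer, and creates a temperature/top-p sampler seeded with the current
// time.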
Runner::Runner(
    const std::string& model_path,
    const std::string& tokenizer_path,
    const float temperature)
    : module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
      tokenizer_(std::make_unique<BPETokenizer>()),
      sampler_(std::make_unique<Sampler>(
          VOCABULARY_SIZE,
          temperature,
          SAMPLER_TOP,
          static_cast<unsigned long long>(std::time(nullptr)))) {
  ET_CHECK_MSG(
      tokenizer_->load(tokenizer_path) == Error::Ok,
      "Failed to load tokenizer at %s",
      tokenizer_path.c_str());
  ET_LOG(
      Info,
      "Created Phi-3-mini runner: model_path=%s, tokenizer_path=%s",
      model_path.c_str(),
      tokenizer_path.c_str());
}
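
// Runs the full generation loop: encodes the prompt, prefills it through the
// model, then decodes one token at a time until the end-of-text token or
// max_seq_len is reached, streaming decoded text to stdout.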
void Runner::generate(const std::string& prompt, std::size_t max_seq_len) {
  auto encode_res = tokenizer_->encode(prompt, 0, 0);
  ET_CHECK_MSG(
      encode_res.error() == Error::Ok, "Failed to encode %s", prompt.c_str());
  auto input_tokens = encode_res.get();

  std::cout << "Prefilling tokens ..." << std::endl;
  for (auto token : input_tokens) {
    std::cout << token << " ";
  }
  std::cout << std::endl;
  std::cout.flush();

  auto prev_token = input_tokens.back();
  auto current_token = prefill(input_tokens);

  std::cout << "Generating tokens ..." << std::endl;
  std::cout << tokenizer_->decode(prev_token, current_token).get();
  std::cout.flush();

  std::size_t seq_len = input_tokens.size() + 1;
  while (current_token != ENDOFTEXT_TOKEN && seq_len < max_seq_len) {
    prev_token = current_token;
    current_token = run_model_step(current_token);
    std::cout << tokenizer_->decode(prev_token, current_token).get();
    std::cout.flush();
    ++seq_len;
  }

  std::cout << std::endl;
}
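
// Samples the next token id from the logits tensor produced by the most
// recent forward pass.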
uint64_t Runner::logits_to_token(const exec_aten::Tensor& logits_tensor) {
  return sampler_->sample(logits_tensor.data_ptr<float>());
}
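
// Feeds the entire prompt to the model in a single forward() call and samples
// the first generated token from the returned logits.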
uint64_t Runner::prefill(std::vector<uint64_t>& tokens) {
  auto result = module_->forward(from_blob(
      tokens.data(),
      {1, static_cast<exec_aten::SizesType>(tokens.size())},
      ScalarType::Long));
  ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens");

  return logits_to_token(result.get()[0].toTensor());
}
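
// Runs a single decode step: feeds one token to the model and samples the
// next token from the returned logits.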
uint64_t Runner::run_model_step(uint64_t token) {
  auto result = module_->forward(from_blob(&token, {1, 1}, ScalarType::Long));
  ET_CHECK_MSG(
      result.error() == Error::Ok,
      "Failed to run forward() for token %" PRIu64,
      token);

  return logits_to_token(result.get()[0].toTensor());
}
} // namespace torch::executor