/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "basic_sampler.h"
#include "basic_tokenizer.h"

#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/result.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::extension::from_blob;
using executorch::extension::Module;
using executorch::runtime::EValue;
using executorch::runtime::Result;

// The token id of the GPT-2 `<|endoftext|>` token, which marks the end of
// the generated text.
#define ENDOFTEXT 50256

std::string generate(
    Module& llm_model,
    const std::string& prompt,
    BasicTokenizer& tokenizer,
    BasicSampler& sampler,
    size_t max_input_length,
    size_t max_output_length) {
  // Convert the input text into a list of integers (tokens) that represents
  // it, using the string-to-token mapping that the model was trained on.
  // Each token is an integer that represents a word or part of a word.
  std::vector<int64_t> input_tokens = tokenizer.encode(prompt);
  std::vector<int64_t> output_tokens;

  for (size_t i = 0; i < max_output_length; i++) {
    // Wrap the input tokens in a {1, length} tensor. Module::forward() will
    // accept it as an EValue, the unified value type of the ExecuTorch
    // runtime.
    auto inputs = from_blob(
        input_tokens.data(),
        {1, static_cast<int>(input_tokens.size())},
        ScalarType::Long);
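    // Note that from_blob() does not copy: `inputs` points directly at the
    // buffer owned by input_tokens, so the vector must stay alive and
    // unmodified until forward() returns.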

    // Run the model. It returns a tensor of logits: unnormalized scores, one
    // per vocabulary entry, where a higher score means a more likely next
    // token.
    auto logits_evalue = llm_model.forward(inputs);

    // Take the first output tensor from the returned EValues and copy the
    // logits into a std::vector, which is what the sampler expects.
    // Result::get() will abort if the forward() call failed.
    Tensor logits_tensor = logits_evalue.get()[0].toTensor();
    std::vector<float> logits(
        logits_tensor.data_ptr<float>(),
        logits_tensor.data_ptr<float>() + logits_tensor.numel());
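    // Note: this assumes the exported model emits logits for only the last
    // sequence position, so `logits` has vocabulary-size length. A model that
    // returned logits for every position would need the last row sliced out
    // before sampling.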

    // Sample the next token from the logits.
    int64_t next_token = sampler.sample(logits);

    // Break if we reached the end of the text.
    if (next_token == ENDOFTEXT) {
      break;
    }

    // Add the next token to the output.
    output_tokens.push_back(next_token);

    // Print the new token as soon as it is generated, flushing so it shows
    // up immediately.
    std::cout << tokenizer.decode({next_token});
    std::cout.flush();

    // Feed the new token back in as input for the next step, dropping the
    // oldest token if the context window is full.
    input_tokens.push_back(next_token);
    if (input_tokens.size() > max_input_length) {
      input_tokens.erase(input_tokens.begin());
    }
  }

  std::cout << std::endl;

  // Convert the output tokens into a human-readable string.
  std::string output_string = tokenizer.decode(output_tokens);
  return output_string;
}
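
// For reference: the simplest sampling strategy is greedy decoding, which
// always picks the highest-scoring token. The sketch below is illustrative
// only and assumes that behavior; the BasicSampler actually used above is
// defined in basic_sampler.h and may differ (e.g. temperature or top-k).
[[maybe_unused]] static int64_t argmax_sample_sketch(
    const std::vector<float>& logits) {
  return std::max_element(logits.begin(), logits.end()) - logits.begin();
}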
int main() {
  // Read the prompt. This provides the seed text for the model to elaborate
  // on.
  std::cout << "Prompt: ";
  std::string prompt;
  std::getline(std::cin, prompt);

  // The tokenizer is used to convert between tokens (used by the model) and
  // human-readable strings.
  BasicTokenizer tokenizer("vocab.json");
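  // vocab.json is assumed to hold the GPT-2 token-to-string vocabulary that
  // matches the exported model.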

  // The sampler is used to sample the next token from the logits.
  BasicSampler sampler;

  // Load the exported nanoGPT program, which was generated in the previous
  // steps. MmapUseMlockIgnoreErrors memory-maps the .pte file and tries to
  // mlock (pin) it, ignoring the error if pinning fails.
  Module model("nanogpt.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);

  const auto max_input_tokens = 1024;
  const auto max_output_tokens = 30;

  // Echo the prompt so that the generated continuation prints directly
  // after it.
  std::cout << prompt;
  generate(
      model, prompt, tokenizer, sampler, max_input_tokens, max_output_tokens);
}
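
// Usage: the program expects nanogpt.pte and vocab.json in the working
// directory. It reads a prompt from stdin and streams up to
// max_output_tokens of generated text to stdout.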