#pragma once

#include <ATen/CPUFunctions.h>
#include <ATen/NativeFunctions.h>
#include <torch/torch.h>

struct DeepAndWide : torch::nn::Module {
  DeepAndWide(int num_features = 50) {
    mu_ = register_parameter("mu_", torch::randn({1, num_features}));
    sigma_ = register_parameter("sigma_", torch::randn({1, num_features}));
    fc_w_ = register_parameter("fc_w_", torch::randn({1, num_features + 1}));
    fc_b_ = register_parameter("fc_b_", torch::randn({1}));
  }

  torch::Tensor forward(
      torch::Tensor ad_emb_packed,
      torch::Tensor user_emb,
      torch::Tensor wide) {
    auto wide_offset = wide + mu_;
    auto wide_normalized = wide_offset * sigma_;
    auto wide_noNaN = wide_normalized;
    // Placeholder for ReplaceNaN
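    // A real NaN-replacement step might look like this (illustrative sketch,
    // not part of the original benchmark):
    //   auto wide_noNaN = at::nan_to_num(wide_normalized, /*nan=*/0.0);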
    auto wide_preproc = torch::clamp(wide_noNaN, -10.0, 10.0);

    auto user_emb_t = torch::transpose(user_emb, 1, 2);
    auto dp_unflatten = torch::bmm(ad_emb_packed, user_emb_t);
    auto dp = torch::flatten(dp_unflatten, 1);
    auto input = torch::cat({dp, wide_preproc}, 1);
    auto fc1 = torch::nn::functional::linear(input, fc_w_, fc_b_);
    auto pred = torch::sigmoid(fc1);
    return pred;
  }
  torch::Tensor mu_, sigma_, fc_w_, fc_b_;
};
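
// Illustrative usage sketch (an addition, not part of the original benchmark
// file): shows the input shapes DeepAndWide::forward expects, assuming batch
// size B = 1 and embedding width E = num_features.
inline torch::Tensor runDeepAndWideExample() {
  const int num_features = 50;
  DeepAndWide model(num_features);
  auto ad_emb_packed = torch::randn({1, 1, num_features}); // (B, 1, E)
  auto user_emb = torch::randn({1, 1, num_features}); // (B, 1, E)
  auto wide = torch::randn({1, num_features}); // (B, F), F == num_features
  return model.forward(ad_emb_packed, user_emb, wide); // (B, 1)
}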

// Implementation using native functions and pre-allocated tensors.
// It can be used as a "speed of light" baseline for the static runtime.
struct DeepAndWideFast : torch::nn::Module {
  DeepAndWideFast(int num_features = 50) {
    mu_ = register_parameter("mu_", torch::randn({1, num_features}));
    sigma_ = register_parameter("sigma_", torch::randn({1, num_features}));
    fc_w_ = register_parameter("fc_w_", torch::randn({1, num_features + 1}));
    fc_b_ = register_parameter("fc_b_", torch::randn({1}));
    allocated = false;
    prealloc_tensors = {};
  }

  torch::Tensor forward(
      torch::Tensor ad_emb_packed,
      torch::Tensor user_emb,
      torch::Tensor wide) {
    torch::NoGradGuard no_grad;
    if (!allocated) {
      auto wide_offset = at::add(wide, mu_);
      auto wide_normalized = at::mul(wide_offset, sigma_);
      // Placeholder for ReplaceNaN
      auto wide_preproc = at::cpu::clamp(wide_normalized, -10.0, 10.0);

      auto user_emb_t = at::native::transpose(user_emb, 1, 2);
      auto dp_unflatten = at::cpu::bmm(ad_emb_packed, user_emb_t);
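      // Shape note (inferred from the view below): ad_emb_packed is (B, 1, E)
      // and user_emb_t is (B, E, 1), so the bmm result is (B, 1, 1) and the
      // view is equivalent to flattening from dim 1.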
      // auto dp = at::native::flatten(dp_unflatten, 1);
      auto dp = dp_unflatten.view({dp_unflatten.size(0), 1});
      auto input = at::cpu::cat({dp, wide_preproc}, 1);

      // fc1 = torch::nn::functional::linear(input, fc_w_, fc_b_);
      fc_w_t_ = torch::t(fc_w_);
      auto fc1 = torch::addmm(fc_b_, input, fc_w_t_);
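      // addmm(bias, input, weight.t()) computes the same result as
      // linear(input, weight, bias); the transpose is hoisted out and cached
      // in fc_w_t_ so later calls skip it.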

      auto pred = at::cpu::sigmoid(fc1);

      prealloc_tensors = {
          wide_offset,
          wide_normalized,
          wide_preproc,
          user_emb_t,
          dp_unflatten,
          dp,
          input,
          fc1,
          pred};
      allocated = true;

      return pred;
    } else {
      // Potential optimization: add and mul could be fused together
      // (e.g. with Eigen).
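      // For instance, a single elementwise pass computing
      // (wide + mu_) * sigma_ would avoid materializing wide_offset
      // (a sketch of the idea only; not implemented here).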
      at::add_out(prealloc_tensors[0], wide, mu_);
      at::mul_out(prealloc_tensors[1], prealloc_tensors[0], sigma_);

      at::native::clip_out(
          prealloc_tensors[1], -10.0, 10.0, prealloc_tensors[2]);

      // Potential optimization: original tensor could be pre-transposed.
      // prealloc_tensors[3] = at::native::transpose(user_emb, 1, 2);
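      // set_() re-points the pre-allocated tensor at user_emb's storage with
      // dims 1 and 2 swapped, i.e. a zero-copy transpose into the cached slot.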
      if (prealloc_tensors[3].data_ptr() != user_emb.data_ptr()) {
        auto sizes = user_emb.sizes();
        auto strides = user_emb.strides();
        prealloc_tensors[3].set_(
            user_emb.storage(),
            0,
            {sizes[0], sizes[2], sizes[1]},
            {strides[0], strides[2], strides[1]});
      }

      // Potential optimization: call MKLDNN directly.
      at::cpu::bmm_out(prealloc_tensors[4], ad_emb_packed, prealloc_tensors[3]);

      if (prealloc_tensors[5].data_ptr() != prealloc_tensors[4].data_ptr()) {
        // In the unlikely case that the input tensor changed, we need to
        // reinitialize the view.
        prealloc_tensors[5] =
            prealloc_tensors[4].view({prealloc_tensors[4].size(0), 1});
      }

      // Potential optimization: we can replace cat with carefully constructed
      // tensor views on the output that are passed to the _out ops above.
      at::cpu::cat_outf(
          {prealloc_tensors[5], prealloc_tensors[2]}, 1, prealloc_tensors[6]);
      at::cpu::addmm_out(
          prealloc_tensors[7], fc_b_, prealloc_tensors[6], fc_w_t_, 1, 1);
      at::cpu::sigmoid_out(prealloc_tensors[8], prealloc_tensors[7]);

      return prealloc_tensors[8];
    }
  }
  torch::Tensor mu_, sigma_, fc_w_, fc_b_, fc_w_t_;
  std::vector<torch::Tensor> prealloc_tensors;
  bool allocated = false;
};

torch::jit::Module getDeepAndWideScriptModel(int num_features = 50);

torch::jit::Module getTrivialScriptModel();

torch::jit::Module getLeakyReLUScriptModel();

torch::jit::Module getLeakyReLUConstScriptModel();

torch::jit::Module getLongScriptModel();

torch::jit::Module getSignedLog1pModel();
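
// Illustrative usage sketch (an assumption, not part of the original header):
// the deep-and-wide script model is expected to take the same three inputs
// as DeepAndWide::forward above; its graph is defined elsewhere.
inline torch::Tensor runDeepAndWideScriptExample() {
  torch::jit::Module module = getDeepAndWideScriptModel(/*num_features=*/50);
  std::vector<torch::jit::IValue> inputs{
      torch::randn({1, 1, 50}), // ad_emb_packed: (B, 1, E)
      torch::randn({1, 1, 50}), // user_emb: (B, 1, E)
      torch::randn({1, 50})}; // wide: (B, F)
  return module.forward(inputs).toTensor();
}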