tensorflow/lite/delegates/gpu/common/convert.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #include "tensorflow/lite/delegates/gpu/common/convert.h"

 #include <fp16.h>
 #include "absl/strings/str_cat.h"
 #include "tensorflow/lite/delegates/gpu/common/status.h"
 #include "tensorflow/lite/delegates/gpu/common/types.h"
 #include "tensorflow/lite/delegates/gpu/common/util.h"

 namespace tflite {
 namespace gpu {
 namespace {

 constexpr int kPhwc4ChannelsInPlane = 4;
 constexpr int kPhwo4i4ChannelsInPlane = 4;
 constexpr int kPiohw4ChannelsInPlane = 4;

 // Repacks OHWI weights into the Po,H,W,OI4x4 layout.
 //
 // `in` must hold shape.DimensionsProduct() floats and `out` must hold
 // GetElementsSizeForPHWO4I4(shape) floats; otherwise InvalidArgumentError is
 // returned before anything is written. Output is produced sequentially: for
 // each group of 4 output channels, each (h, w) position and each group of 4
 // input channels, a 4x4 tile is emitted with the output channel as the slower
 // index. Tile slots whose channel index lies outside `shape` are written as 0.
 // When `reverse_space` is true the source element is read from the mirrored
 // (h, w) position.
 Status ConvertToPHWO4I4(absl::Span<const float> in, const OHWI& shape,
                         absl::Span<float> out, bool reverse_space) {
   if (in.size() != shape.DimensionsProduct()) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertToPHWO4I4: Input data size does not match expected size: ",
         in.size(), " != ", shape.DimensionsProduct()));
   }
   if (out.size() != GetElementsSizeForPHWO4I4(shape)) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertToPHWO4I4: Output data size does not match expected size: ",
         out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
   }

   const int o_groups = IntegralDivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane);
   const int i_groups = IntegralDivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane);

   float* dst = out.data();
   for (int og = 0; og < o_groups; ++og) {
     for (int y = 0; y < shape.h; ++y) {
       // Mirroring only affects where the source element is read from.
       const int src_y = reverse_space ? shape.h - 1 - y : y;
       for (int x = 0; x < shape.w; ++x) {
         const int src_x = reverse_space ? shape.w - 1 - x : x;
         for (int ig = 0; ig < i_groups; ++ig) {
           for (int oo = 0; oo < kPhwo4i4ChannelsInPlane; ++oo) {
             const int tensor_o = og * kPhwo4i4ChannelsInPlane + oo;
             for (int ii = 0; ii < kPhwo4i4ChannelsInPlane; ++ii) {
               const int tensor_i = ig * kPhwo4i4ChannelsInPlane + ii;
               float value = 0;
               if (tensor_i < shape.i && tensor_o < shape.o) {
                 // The source tensor is addressed in OHWI order.
                 value =
                     in[shape.LinearIndex({tensor_o, src_y, src_x, tensor_i})];
               }
               *dst = value;
               ++dst;
             }
           }
         }
       }
     }
   }
   return OkStatus();
 }

 }  // namespace

 // Returns the number of elements of a PHWO4I4 buffer for OHWI weights of
 // `shape`: AlignByN(i, 4) * AlignByN(o, 4) * h * w.
 uint32_t GetElementsSizeForPHWO4I4(const OHWI& shape) {
   const auto aligned_i = AlignByN(shape.i, kPhwo4i4ChannelsInPlane);
   const auto aligned_o = AlignByN(shape.o, kPhwo4i4ChannelsInPlane);
   return aligned_i * aligned_o * shape.h * shape.w;
 }

 // Returns the number of elements of a PHWO4I4 buffer for IHWO weights of
 // `shape`: AlignByN(i, 4) * AlignByN(o, 4) * h * w.
 uint32_t GetElementsSizeForPHWO4I4(const IHWO& shape) {
   const auto aligned_i = AlignByN(shape.i, kPhwo4i4ChannelsInPlane);
   const auto aligned_o = AlignByN(shape.o, kPhwo4i4ChannelsInPlane);
   return aligned_i * aligned_o * shape.h * shape.w;
 }

 std::vector<float> ConvertToPHWO4I4(
     const Tensor<OHWI, DataType::FLOAT32>& tensor) {
   std::vector<float> transposed(GetElementsSizeForPHWO4I4(tensor.shape));
   ConvertToPHWO4I4(tensor.data, tensor.shape,
                    absl::MakeSpan(transposed.data(), transposed.size()),
                    /*reverse_space=*/false)
       .IgnoreError();
   return transposed;
 }

 std::vector<float> ConvertToPHWO4I4Transposed(
     const Tensor<OHWI, DataType::FLOAT32>& tensor) {
   std::vector<float> transposed(GetElementsSizeForPHWO4I4(tensor.shape));
   ConvertToPHWO4I4(tensor.data, tensor.shape,
                    absl::MakeSpan(transposed.data(), transposed.size()),
                    /*reverse_space=*/true)
       .IgnoreError();
   return transposed;
 }

 // Returns the 3D extent (AlignByN(i, 4), h * w, IntegralDivideRoundUp(o, 4))
 // for PHWO4I4 weights of the given OHWI shape.
 uint3 Get3DSizeForPHWO4I4(const OHWI& shape) {
   const auto aligned_inputs = AlignByN(shape.i, 4);
   const auto spatial_size = shape.h * shape.w;
   const auto output_groups = IntegralDivideRoundUp(shape.o, 4);
   return uint3(aligned_inputs, spatial_size, output_groups);
 }

 // Repacks IHWO weights into the Po,H,W,OI4x4 layout.
 //
 // `in` must hold shape.DimensionsProduct() floats and `out` must hold
 // GetElementsSizeForPHWO4I4(shape) floats; otherwise InvalidArgumentError is
 // returned before anything is written. For each group of 4 output channels,
 // each (y, x) position and each group of 4 input channels, a 4x4 tile is
 // emitted with the output channel as the slower index. Slots whose channel
 // index lies outside `shape` are written as 0.
 Status ConvertToPHWO4I4(absl::Span<const float> in, const IHWO& shape,
                         absl::Span<float> out) {
   if (in.size() != shape.DimensionsProduct()) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertToPHWO4I4: Input data size does not match expected size: ",
         in.size(), " != ", shape.DimensionsProduct()));
   }
   if (out.size() != GetElementsSizeForPHWO4I4(shape)) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertToPHWO4I4: Output data size does not match expected size: ",
         out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
   }

   const int out_groups = IntegralDivideRoundUp(shape.o, 4);
   const int in_groups = IntegralDivideRoundUp(shape.i, 4);

   float* dst = out.data();
   for (int og = 0; og < out_groups; ++og) {
     for (int row = 0; row < shape.h; ++row) {
       for (int col = 0; col < shape.w; ++col) {
         for (int ig = 0; ig < in_groups; ++ig) {
           for (int oo = 0; oo < 4; ++oo) {
             const int dst_channel = og * 4 + oo;
             for (int ii = 0; ii < 4; ++ii) {
               const int src_channel = ig * 4 + ii;
               float value = 0;
               if (src_channel < shape.i && dst_channel < shape.o) {
                 // The source tensor is addressed in IHWO order.
                 value =
                     in[shape.LinearIndex({src_channel, row, col, dst_channel})];
               }
               *dst = value;
               ++dst;
             }
           }
         }
       }
     }
   }
   return OkStatus();
 }

 std::vector<float> ConvertToPHWO4I4(
     const Tensor<IHWO, DataType::FLOAT32>& tensor) {
   std::vector<float> transposed(GetElementsSizeForPHWO4I4(tensor.shape));
   ConvertToPHWO4I4(tensor.data, tensor.shape,
                    absl::MakeSpan(transposed.data(), transposed.size()))
       .IgnoreError();
   return transposed;
 }

 // Returns the number of elements of a PIOHW4 buffer for OHWI weights of
 // `shape`: AlignByN(o * i, 4) * h * w.
 uint32_t GetElementsSizeForPIOHW4(const OHWI& shape) {
   const auto aligned_channels =
       AlignByN(shape.o * shape.i, kPiohw4ChannelsInPlane);
   return aligned_channels * shape.h * shape.w;
 }

 // Repacks OHWI weights into the PIOHW4 layout.
 //
 // The o * i (output, input) pairs are flattened into one channel axis where
 // flat channel k maps to o = k % shape.o and i = k / shape.o. That axis is
 // split into planes of 4 channels; for every plane and (h, w) position the 4
 // channel values are written contiguously, with 0 for channels past o * i.
 // Returns InvalidArgumentError if `in` or `out` has an unexpected size.
 Status ConvertToPIOHW4(absl::Span<const float> in, const OHWI& shape,
                        absl::Span<float> out) {
   if (in.size() != shape.DimensionsProduct()) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertToPIOHW4: Input data size does not match expected size: ",
         in.size(), " != ", shape.DimensionsProduct()));
   }
   if (out.size() != GetElementsSizeForPIOHW4(shape)) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertToPIOHW4: Output data size does not match expected size: ",
         out.size(), " != ", GetElementsSizeForPIOHW4(shape)));
   }

   const int32_t total_channels = shape.o * shape.i;
   const int32_t plane_count =
       IntegralDivideRoundUp(total_channels, kPiohw4ChannelsInPlane);

   float* dst = out.data();
   for (int plane = 0; plane < plane_count; ++plane) {
     for (int row = 0; row < shape.h; ++row) {
       for (int col = 0; col < shape.w; ++col) {
         for (int lane = 0; lane < kPiohw4ChannelsInPlane; ++lane) {
           const int channel = plane * kPiohw4ChannelsInPlane + lane;
           float value = 0;
           if (channel < total_channels) {
             value = in[shape.LinearIndex(
                 {channel % shape.o, row, col, channel / shape.o})];
           }
           *dst = value;
           ++dst;
         }
       }
     }
   }
   return OkStatus();
 }

 std::vector<float> ConvertToPIOHW4(
     const Tensor<OHWI, DataType::FLOAT32>& tensor) {
   std::vector<float> transposed(GetElementsSizeForPIOHW4(tensor.shape));
   ConvertToPIOHW4(tensor.data, tensor.shape,
                   absl::MakeSpan(transposed.data(), transposed.size()))
       .IgnoreError();
   return transposed;
 }

 // Checks the buffer sizes for a BHWC -> PHWC4 conversion.
 // `in` must contain shape.DimensionsProduct() elements and `out` must contain
 // GetElementsSizeForPHWC4(shape) elements. The input is checked first.
 // Returns InvalidArgumentError describing the first mismatch, else OkStatus.
 template <typename T>
 Status ValidateConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
                               absl::Span<T> out) {
   const bool input_size_ok = in.size() == shape.DimensionsProduct();
   if (!input_size_ok) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertToPHWC4: Input data size does not match expected size: ",
         in.size(), " != ", shape.DimensionsProduct()));
   }
   const bool output_size_ok = out.size() == GetElementsSizeForPHWC4(shape);
   if (!output_size_ok) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertToPHWC4: Output data size does not match expected size: ",
         out.size(), " != ", GetElementsSizeForPHWC4(shape)));
   }
   return OkStatus();
 }

 // Converts a BHWC float tensor into the Pc,H,W,C4 layout, where P is a plane
 // of kPhwc4ChannelsInPlane channels.
 //
 // Buffer sizes are validated first. When shape.c == 4 the data is copied
 // verbatim. Otherwise every batch is written as a sequence of planes, each
 // holding 4 channel values per pixel; the last plane is zero padded when
 // shape.c is not a multiple of 4.
 Status ConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
                       absl::Span<float> out) {
   RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));
   if (shape.c == 4) {
     std::memcpy(out.data(), in.data(),
                 shape.DimensionsProduct() * sizeof(float));
     return OkStatus();
   }

   const int plane_count = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
   const int pixel_count = shape.h * shape.w;
   // Planes in which all kPhwc4ChannelsInPlane channels exist in the source.
   const int full_planes = shape.c / kPhwc4ChannelsInPlane;
   const int batch_stride = pixel_count * plane_count * kPhwc4ChannelsInPlane;
   const int tail_channels = shape.c - full_planes * kPhwc4ChannelsInPlane;

   // Pass 1: copy every full plane, four channels per pixel.
   for (int batch = 0; batch < shape.b; ++batch) {
     float* batch_out = out.data() + batch * batch_stride;
     for (int plane = 0; plane < full_planes; ++plane) {
       const float* plane_in =
           in.data() +
           shape.LinearIndex({batch, 0, 0, plane * kPhwc4ChannelsInPlane});
       float* plane_out =
           batch_out + plane * pixel_count * kPhwc4ChannelsInPlane;
       for (int px = 0; px < pixel_count; ++px) {
         std::memcpy(plane_out + px * kPhwc4ChannelsInPlane,
                     plane_in + px * shape.c,
                     kPhwc4ChannelsInPlane * sizeof(float));
       }
     }
   }

   if (tail_channels == 0) {
     return OkStatus();
   }

   // Pass 2: the last, partial plane. Copy the channels that exist and zero
   // the rest of each 4-channel slot.
   for (int batch = 0; batch < shape.b; ++batch) {
     const float* tail_in =
         in.data() +
         shape.LinearIndex({batch, 0, 0, full_planes * kPhwc4ChannelsInPlane});
     float* tail_out = out.data() + batch * batch_stride +
                       pixel_count * full_planes * kPhwc4ChannelsInPlane;
     for (int px = 0; px < pixel_count; ++px) {
       float* slot = tail_out + px * kPhwc4ChannelsInPlane;
       std::memcpy(slot, tail_in + px * shape.c,
                   tail_channels * sizeof(float));
       std::memset(slot + tail_channels, 0,
                   (kPhwc4ChannelsInPlane - tail_channels) * sizeof(float));
     }
   }
   return OkStatus();
 }

 // Converts a BHWC float tensor into the Pc,H,W,C4 layout, storing each value
 // as fp16 bits produced by fp16_ieee_from_fp32_value.
 //
 // Buffer sizes are validated first. Every batch is written as a sequence of
 // planes of kPhwc4ChannelsInPlane channels per pixel; the last plane is
 // padded with 0 when shape.c is not a multiple of 4.
 Status ConvertToPHWC4Half(absl::Span<const float> in, const BHWC& shape,
                           absl::Span<HalfBits> out) {
   RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));

   const int plane_count = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
   const int pixel_count = shape.h * shape.w;
   // Planes in which all kPhwc4ChannelsInPlane channels exist in the source.
   const int full_planes = shape.c / kPhwc4ChannelsInPlane;
   const int batch_stride = pixel_count * plane_count * kPhwc4ChannelsInPlane;
   const int tail_channels = shape.c - full_planes * kPhwc4ChannelsInPlane;

   // Pass 1: convert every full plane, four channels per pixel.
   for (int batch = 0; batch < shape.b; ++batch) {
     HalfBits* batch_out = out.data() + batch * batch_stride;
     for (int plane = 0; plane < full_planes; ++plane) {
       const float* plane_in =
           in.data() +
           shape.LinearIndex({batch, 0, 0, plane * kPhwc4ChannelsInPlane});
       HalfBits* plane_out =
           batch_out + plane * pixel_count * kPhwc4ChannelsInPlane;
       for (int px = 0; px < pixel_count; ++px) {
         const float* texel_in = plane_in + px * shape.c;
         HalfBits* texel_out = plane_out + px * kPhwc4ChannelsInPlane;
         for (int lane = 0; lane < kPhwc4ChannelsInPlane; ++lane) {
           texel_out[lane] = fp16_ieee_from_fp32_value(texel_in[lane]);
         }
       }
     }
   }

   if (tail_channels == 0) {
     return OkStatus();
   }

   // Pass 2: the last, partial plane. Convert the channels that exist and
   // write 0 into the rest of each 4-channel slot.
   for (int batch = 0; batch < shape.b; ++batch) {
     // Only 1, 2 or 3 leftover channels are handled; anything else is
     // rejected once a batch is actually processed.
     if (tail_channels < 1 || tail_channels > 3) {
       return UnimplementedError(
           "ConvertToPHWC4Half: Unsupported channels per planes count.");
     }
     const float* tail_in =
         in.data() +
         shape.LinearIndex({batch, 0, 0, full_planes * kPhwc4ChannelsInPlane});
     HalfBits* tail_out = out.data() + batch * batch_stride +
                          pixel_count * full_planes * kPhwc4ChannelsInPlane;
     for (int px = 0; px < pixel_count; ++px) {
       const float* texel_in = tail_in + px * shape.c;
       HalfBits* texel_out = tail_out + px * kPhwc4ChannelsInPlane;
       for (int lane = 0; lane < kPhwc4ChannelsInPlane; ++lane) {
         if (lane < tail_channels) {
           texel_out[lane] = fp16_ieee_from_fp32_value(texel_in[lane]);
         } else {
           texel_out[lane] = 0;
         }
       }
     }
   }
   return OkStatus();
 }

 std::vector<float> ConvertToPHWC4(
     const Tensor<BHWC, DataType::FLOAT32>& tensor) {
   std::vector<float> transposed(GetElementsSizeForPHWC4(tensor.shape));
   ConvertToPHWC4(tensor.data, tensor.shape,
                  absl::MakeSpan(transposed.data(), transposed.size()))
       .IgnoreError();
   // TODO(akulik): Maybe safer to return Status.
   return transposed;
 }

 std::vector<float> ConvertToPHWC4(
     const Tensor<HWC, DataType::FLOAT32>& tensor) {
   const BHWC batched_shape =
       BHWC(1, tensor.shape.h, tensor.shape.w, tensor.shape.c);
   std::vector<float> transposed(GetElementsSizeForPHWC4(batched_shape));
   ConvertToPHWC4(tensor.data, batched_shape,
                  absl::MakeSpan(transposed.data(), transposed.size()))
       .IgnoreError();
   // TODO(akulik): Maybe safer to return Status.
   return transposed;
 }

 // Returns the number of elements of a PHWC4 buffer for a BHWC tensor of
 // `shape`: b * h * w * AlignByN(c, 4).
 uint32_t GetElementsSizeForPHWC4(const BHWC& shape) {
   const auto aligned_channels = AlignByN(shape.c, kPhwc4ChannelsInPlane);
   return shape.b * shape.h * shape.w * aligned_channels;
 }

 // Checks the buffer sizes for a PHWC4 -> BHWC conversion.
 // `in` must contain GetElementsSizeForPHWC4(shape) elements and `out` must
 // contain shape.DimensionsProduct() elements. The input is checked first.
 // Returns InvalidArgumentError describing the first mismatch, else OkStatus.
 template <typename T>
 Status ValidateConvertFromPHWC4(absl::Span<const T> in, const BHWC& shape,
                                 absl::Span<float> out) {
   const bool input_size_ok = in.size() == GetElementsSizeForPHWC4(shape);
   if (!input_size_ok) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertFromPHWC4: Input data size does not match expected size: ",
         in.size(), " != ", GetElementsSizeForPHWC4(shape)));
   }
   const bool output_size_ok = out.size() == shape.DimensionsProduct();
   if (!output_size_ok) {
     return InvalidArgumentError(absl::StrCat(
         "ConvertFromPHWC4: Output data size does not match expected size: ",
         out.size(), " != ", shape.DimensionsProduct()));
   }
   return OkStatus();
 }

 // Converts float data in the Pc,H,W,C4 layout back into a BHWC tensor.
 //
 // Buffer sizes are validated first. When shape.c == 4 the data is copied
 // verbatim. Otherwise each plane of kPhwc4ChannelsInPlane channels is
 // scattered into the matching channel range of every pixel; for the last,
 // partial plane only the channels that exist in `shape` are copied.
 Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
                         absl::Span<float> out) {
   RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));
   if (shape.c == 4) {
     std::memcpy(out.data(), in.data(),
                 shape.DimensionsProduct() * sizeof(float));
     return OkStatus();
   }

   const int plane_count = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
   const int pixel_count = shape.h * shape.w;
   const int batch_stride = pixel_count * plane_count * kPhwc4ChannelsInPlane;
   // Planes in which all kPhwc4ChannelsInPlane channels exist in the output.
   const int full_planes = shape.c / kPhwc4ChannelsInPlane;
   const int tail_channels = shape.c - full_planes * kPhwc4ChannelsInPlane;

   // Pass 1: copy every full plane, four channels per pixel.
   for (int batch = 0; batch < shape.b; ++batch) {
     const float* batch_in = in.data() + batch * batch_stride;
     for (int plane = 0; plane < full_planes; ++plane) {
       const float* plane_in =
           batch_in + plane * pixel_count * kPhwc4ChannelsInPlane;
       float* plane_out =
           out.data() +
           shape.LinearIndex({batch, 0, 0, plane * kPhwc4ChannelsInPlane});
       for (int px = 0; px < pixel_count; ++px) {
         std::memcpy(plane_out + px * shape.c,
                     plane_in + px * kPhwc4ChannelsInPlane,
                     kPhwc4ChannelsInPlane * sizeof(float));
       }
     }
   }

   if (tail_channels == 0) {
     return OkStatus();
   }

   // Pass 2: the last, partial plane. Padding lanes are dropped.
   for (int batch = 0; batch < shape.b; ++batch) {
     const float* tail_in = in.data() + batch * batch_stride +
                            pixel_count * full_planes * kPhwc4ChannelsInPlane;
     float* tail_out =
         out.data() +
         shape.LinearIndex({batch, 0, 0, full_planes * kPhwc4ChannelsInPlane});
     for (int px = 0; px < pixel_count; ++px) {
       std::memcpy(tail_out + px * shape.c,
                   tail_in + px * kPhwc4ChannelsInPlane,
                   tail_channels * sizeof(float));
     }
   }
   return OkStatus();
 }

 // Converts fp16 data in the Pc,H,W,C4 layout back into a BHWC float tensor,
 // decoding each value with fp16_ieee_to_fp32_value.
 //
 // Buffer sizes are validated first. Each plane of kPhwc4ChannelsInPlane
 // channels is scattered into the matching channel range of every pixel; for
 // the last, partial plane only the channels that exist in `shape` are
 // decoded.
 // Returns UnimplementedError if the leftover channel count is not 1, 2 or 3.
 Status ConvertFromPHWC4Half(absl::Span<const HalfBits> in, const BHWC& shape,
                             absl::Span<float> out) {
   RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));

   const int plane_count = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
   const int pixel_count = shape.h * shape.w;
   const int batch_stride = pixel_count * plane_count * kPhwc4ChannelsInPlane;
   // Planes in which all kPhwc4ChannelsInPlane channels exist in the output.
   const int full_planes = shape.c / kPhwc4ChannelsInPlane;
   const int tail_channels = shape.c - full_planes * kPhwc4ChannelsInPlane;

   // Pass 1: decode every full plane, four channels per pixel.
   for (int batch = 0; batch < shape.b; ++batch) {
     const HalfBits* batch_in = in.data() + batch * batch_stride;
     for (int plane = 0; plane < full_planes; ++plane) {
       const HalfBits* plane_in =
           batch_in + plane * pixel_count * kPhwc4ChannelsInPlane;
       float* plane_out =
           out.data() +
           shape.LinearIndex({batch, 0, 0, plane * kPhwc4ChannelsInPlane});
       for (int px = 0; px < pixel_count; ++px) {
         const HalfBits* texel_in = plane_in + px * kPhwc4ChannelsInPlane;
         float* texel_out = plane_out + px * shape.c;
         for (int lane = 0; lane < kPhwc4ChannelsInPlane; ++lane) {
           texel_out[lane] = fp16_ieee_to_fp32_value(texel_in[lane]);
         }
       }
     }
   }

   if (tail_channels == 0) {
     return OkStatus();
   }

   // Pass 2: the last, partial plane. Padding lanes are dropped.
   for (int batch = 0; batch < shape.b; ++batch) {
     // Only 1, 2 or 3 leftover channels are handled. The message now names
     // this function; it previously said "ConvertToPHWC4Half".
     if (tail_channels < 1 || tail_channels > 3) {
       return UnimplementedError(
           "ConvertFromPHWC4Half: Unsupported channels per planes count.");
     }
     const HalfBits* tail_in =
         in.data() + batch * batch_stride +
         pixel_count * full_planes * kPhwc4ChannelsInPlane;
     float* tail_out =
         out.data() +
         shape.LinearIndex({batch, 0, 0, full_planes * kPhwc4ChannelsInPlane});
     for (int px = 0; px < pixel_count; ++px) {
       const HalfBits* texel_in = tail_in + px * kPhwc4ChannelsInPlane;
       float* texel_out = tail_out + px * shape.c;
       for (int lane = 0; lane < tail_channels; ++lane) {
         texel_out[lane] = fp16_ieee_to_fp32_value(texel_in[lane]);
       }
     }
   }
   return OkStatus();
 }

 }  // namespace gpu
 }  // namespace tflite
	/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	#include "tensorflow/lite/delegates/gpu/common/convert.h"

	#include <fp16.h>
	#include "absl/strings/str_cat.h"
	#include "tensorflow/lite/delegates/gpu/common/status.h"
	#include "tensorflow/lite/delegates/gpu/common/types.h"
	#include "tensorflow/lite/delegates/gpu/common/util.h"

	namespace tflite {
	namespace gpu {
	namespace {

	constexpr int kPhwc4ChannelsInPlane = 4;
	constexpr int kPhwo4i4ChannelsInPlane = 4;
	constexpr int kPiohw4ChannelsInPlane = 4;

	// Repacks OHWI weights into the Po,H,W,OI4x4 layout.
	//
	// `in` must hold shape.DimensionsProduct() floats and `out` must hold
	// GetElementsSizeForPHWO4I4(shape) floats; otherwise InvalidArgumentError is
	// returned before anything is written. Output is produced sequentially: for
	// each group of 4 output channels, each (h, w) position and each group of 4
	// input channels, a 4x4 tile is emitted with the output channel as the slower
	// index. Tile slots whose channel index lies outside `shape` are written as 0.
	// When `reverse_space` is true the source element is read from the mirrored
	// (h, w) position.
	Status ConvertToPHWO4I4(absl::Span<const float> in, const OHWI& shape,
	                        absl::Span<float> out, bool reverse_space) {
	  if (in.size() != shape.DimensionsProduct()) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertToPHWO4I4: Input data size does not match expected size: ",
	        in.size(), " != ", shape.DimensionsProduct()));
	  }
	  if (out.size() != GetElementsSizeForPHWO4I4(shape)) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertToPHWO4I4: Output data size does not match expected size: ",
	        out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
	  }

	  const int o_groups = IntegralDivideRoundUp(shape.o, kPhwo4i4ChannelsInPlane);
	  const int i_groups = IntegralDivideRoundUp(shape.i, kPhwo4i4ChannelsInPlane);

	  float* dst = out.data();
	  for (int og = 0; og < o_groups; ++og) {
	    for (int y = 0; y < shape.h; ++y) {
	      // Mirroring only affects where the source element is read from.
	      const int src_y = reverse_space ? shape.h - 1 - y : y;
	      for (int x = 0; x < shape.w; ++x) {
	        const int src_x = reverse_space ? shape.w - 1 - x : x;
	        for (int ig = 0; ig < i_groups; ++ig) {
	          for (int oo = 0; oo < kPhwo4i4ChannelsInPlane; ++oo) {
	            const int tensor_o = og * kPhwo4i4ChannelsInPlane + oo;
	            for (int ii = 0; ii < kPhwo4i4ChannelsInPlane; ++ii) {
	              const int tensor_i = ig * kPhwo4i4ChannelsInPlane + ii;
	              float value = 0;
	              if (tensor_i < shape.i && tensor_o < shape.o) {
	                // The source tensor is addressed in OHWI order.
	                value =
	                    in[shape.LinearIndex({tensor_o, src_y, src_x, tensor_i})];
	              }
	              *dst = value;
	              ++dst;
	            }
	          }
	        }
	      }
	    }
	  }
	  return OkStatus();
	}

	} // namespace

	// Returns the number of elements of a PHWO4I4 buffer for OHWI weights of
	// `shape`: AlignByN(i, 4) * AlignByN(o, 4) * h * w.
	uint32_t GetElementsSizeForPHWO4I4(const OHWI& shape) {
	  const auto aligned_i = AlignByN(shape.i, kPhwo4i4ChannelsInPlane);
	  const auto aligned_o = AlignByN(shape.o, kPhwo4i4ChannelsInPlane);
	  return aligned_i * aligned_o * shape.h * shape.w;
	}

	// Returns the number of elements of a PHWO4I4 buffer for IHWO weights of
	// `shape`: AlignByN(i, 4) * AlignByN(o, 4) * h * w.
	uint32_t GetElementsSizeForPHWO4I4(const IHWO& shape) {
	  const auto aligned_i = AlignByN(shape.i, kPhwo4i4ChannelsInPlane);
	  const auto aligned_o = AlignByN(shape.o, kPhwo4i4ChannelsInPlane);
	  return aligned_i * aligned_o * shape.h * shape.w;
	}

	// Convenience overload: allocates GetElementsSizeForPHWO4I4(tensor.shape)
	// floats and fills them through the span-based converter without spatial
	// reversal. A conversion error is ignored; the buffer is returned regardless.
	std::vector<float> ConvertToPHWO4I4(
	    const Tensor<OHWI, DataType::FLOAT32>& tensor) {
	  std::vector<float> transposed(GetElementsSizeForPHWO4I4(tensor.shape));
	  // The argument-name comment had lost its asterisks ("/reverse_space=/"),
	  // which does not parse; it is restored here.
	  ConvertToPHWO4I4(tensor.data, tensor.shape,
	                   absl::MakeSpan(transposed.data(), transposed.size()),
	                   /*reverse_space=*/false)
	      .IgnoreError();
	  return transposed;
	}

	// Same as the Tensor<OHWI> overload of ConvertToPHWO4I4, but asks the
	// span-based converter to read the source with H and W reversed.
	// A conversion error is ignored; the buffer is returned regardless.
	std::vector<float> ConvertToPHWO4I4Transposed(
	    const Tensor<OHWI, DataType::FLOAT32>& tensor) {
	  std::vector<float> transposed(GetElementsSizeForPHWO4I4(tensor.shape));
	  // The argument-name comment had lost its asterisks ("/reverse_space=/"),
	  // which does not parse; it is restored here.
	  ConvertToPHWO4I4(tensor.data, tensor.shape,
	                   absl::MakeSpan(transposed.data(), transposed.size()),
	                   /*reverse_space=*/true)
	      .IgnoreError();
	  return transposed;
	}

	// Returns the 3D extent (AlignByN(i, 4), h * w, IntegralDivideRoundUp(o, 4))
	// for PHWO4I4 weights of the given OHWI shape.
	uint3 Get3DSizeForPHWO4I4(const OHWI& shape) {
	  const auto aligned_inputs = AlignByN(shape.i, 4);
	  const auto spatial_size = shape.h * shape.w;
	  const auto output_groups = IntegralDivideRoundUp(shape.o, 4);
	  return uint3(aligned_inputs, spatial_size, output_groups);
	}

	// Repacks IHWO weights into the Po,H,W,OI4x4 layout.
	//
	// `in` must hold shape.DimensionsProduct() floats and `out` must hold
	// GetElementsSizeForPHWO4I4(shape) floats; otherwise InvalidArgumentError is
	// returned before anything is written. For each group of 4 output channels,
	// each (y, x) position and each group of 4 input channels, a 4x4 tile is
	// emitted with the output channel as the slower index. Slots whose channel
	// index lies outside `shape` are written as 0.
	Status ConvertToPHWO4I4(absl::Span<const float> in, const IHWO& shape,
	                        absl::Span<float> out) {
	  if (in.size() != shape.DimensionsProduct()) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertToPHWO4I4: Input data size does not match expected size: ",
	        in.size(), " != ", shape.DimensionsProduct()));
	  }
	  if (out.size() != GetElementsSizeForPHWO4I4(shape)) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertToPHWO4I4: Output data size does not match expected size: ",
	        out.size(), " != ", GetElementsSizeForPHWO4I4(shape)));
	  }

	  const int out_groups = IntegralDivideRoundUp(shape.o, 4);
	  const int in_groups = IntegralDivideRoundUp(shape.i, 4);

	  float* dst = out.data();
	  for (int og = 0; og < out_groups; ++og) {
	    for (int row = 0; row < shape.h; ++row) {
	      for (int col = 0; col < shape.w; ++col) {
	        for (int ig = 0; ig < in_groups; ++ig) {
	          for (int oo = 0; oo < 4; ++oo) {
	            const int dst_channel = og * 4 + oo;
	            for (int ii = 0; ii < 4; ++ii) {
	              const int src_channel = ig * 4 + ii;
	              float value = 0;
	              if (src_channel < shape.i && dst_channel < shape.o) {
	                // The source tensor is addressed in IHWO order.
	                value =
	                    in[shape.LinearIndex({src_channel, row, col, dst_channel})];
	              }
	              *dst = value;
	              ++dst;
	            }
	          }
	        }
	      }
	    }
	  }
	  return OkStatus();
	}

	std::vector<float> ConvertToPHWO4I4(
	const Tensor<IHWO, DataType::FLOAT32>& tensor) {
	std::vector<float> transposed(GetElementsSizeForPHWO4I4(tensor.shape));
	ConvertToPHWO4I4(tensor.data, tensor.shape,
	absl::MakeSpan(transposed.data(), transposed.size()))
	.IgnoreError();
	return transposed;
	}

	// Returns the number of elements of a PIOHW4 buffer for OHWI weights of
	// `shape`: AlignByN(o * i, 4) * h * w.
	uint32_t GetElementsSizeForPIOHW4(const OHWI& shape) {
	  const auto aligned_channels =
	      AlignByN(shape.o * shape.i, kPiohw4ChannelsInPlane);
	  return aligned_channels * shape.h * shape.w;
	}

	// Repacks OHWI weights into the PIOHW4 layout.
	//
	// The o * i (output, input) pairs are flattened into one channel axis where
	// flat channel k maps to o = k % shape.o and i = k / shape.o. That axis is
	// split into planes of 4 channels; for every plane and (h, w) position the 4
	// channel values are written contiguously, with 0 for channels past o * i.
	// Returns InvalidArgumentError if `in` or `out` has an unexpected size.
	Status ConvertToPIOHW4(absl::Span<const float> in, const OHWI& shape,
	                       absl::Span<float> out) {
	  if (in.size() != shape.DimensionsProduct()) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertToPIOHW4: Input data size does not match expected size: ",
	        in.size(), " != ", shape.DimensionsProduct()));
	  }
	  if (out.size() != GetElementsSizeForPIOHW4(shape)) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertToPIOHW4: Output data size does not match expected size: ",
	        out.size(), " != ", GetElementsSizeForPIOHW4(shape)));
	  }

	  const int32_t total_channels = shape.o * shape.i;
	  const int32_t plane_count =
	      IntegralDivideRoundUp(total_channels, kPiohw4ChannelsInPlane);

	  float* dst = out.data();
	  for (int plane = 0; plane < plane_count; ++plane) {
	    for (int row = 0; row < shape.h; ++row) {
	      for (int col = 0; col < shape.w; ++col) {
	        for (int lane = 0; lane < kPiohw4ChannelsInPlane; ++lane) {
	          const int channel = plane * kPiohw4ChannelsInPlane + lane;
	          float value = 0;
	          if (channel < total_channels) {
	            value = in[shape.LinearIndex(
	                {channel % shape.o, row, col, channel / shape.o})];
	          }
	          *dst = value;
	          ++dst;
	        }
	      }
	    }
	  }
	  return OkStatus();
	}

	std::vector<float> ConvertToPIOHW4(
	const Tensor<OHWI, DataType::FLOAT32>& tensor) {
	std::vector<float> transposed(GetElementsSizeForPIOHW4(tensor.shape));
	ConvertToPIOHW4(tensor.data, tensor.shape,
	absl::MakeSpan(transposed.data(), transposed.size()))
	.IgnoreError();
	return transposed;
	}

	// Checks the buffer sizes for a BHWC -> PHWC4 conversion.
	// `in` must contain shape.DimensionsProduct() elements and `out` must contain
	// GetElementsSizeForPHWC4(shape) elements. The input is checked first.
	// Returns InvalidArgumentError describing the first mismatch, else OkStatus.
	template <typename T>
	Status ValidateConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
	                              absl::Span<T> out) {
	  const bool input_size_ok = in.size() == shape.DimensionsProduct();
	  if (!input_size_ok) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertToPHWC4: Input data size does not match expected size: ",
	        in.size(), " != ", shape.DimensionsProduct()));
	  }
	  const bool output_size_ok = out.size() == GetElementsSizeForPHWC4(shape);
	  if (!output_size_ok) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertToPHWC4: Output data size does not match expected size: ",
	        out.size(), " != ", GetElementsSizeForPHWC4(shape)));
	  }
	  return OkStatus();
	}

	// Converts a BHWC float tensor into the Pc,H,W,C4 layout, where P is a plane
	// of kPhwc4ChannelsInPlane channels.
	//
	// Buffer sizes are validated first. When shape.c == 4 the data is copied
	// verbatim. Otherwise every batch is written as a sequence of planes, each
	// holding 4 channel values per pixel; the last plane is zero padded when
	// shape.c is not a multiple of 4.
	Status ConvertToPHWC4(absl::Span<const float> in, const BHWC& shape,
	                      absl::Span<float> out) {
	  RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));
	  if (shape.c == 4) {
	    std::memcpy(out.data(), in.data(),
	                shape.DimensionsProduct() * sizeof(float));
	    return OkStatus();
	  }

	  const int plane_count = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
	  const int pixel_count = shape.h * shape.w;
	  // Planes in which all kPhwc4ChannelsInPlane channels exist in the source.
	  const int full_planes = shape.c / kPhwc4ChannelsInPlane;
	  const int batch_stride = pixel_count * plane_count * kPhwc4ChannelsInPlane;
	  const int tail_channels = shape.c - full_planes * kPhwc4ChannelsInPlane;

	  // Pass 1: copy every full plane, four channels per pixel.
	  for (int batch = 0; batch < shape.b; ++batch) {
	    float* batch_out = out.data() + batch * batch_stride;
	    for (int plane = 0; plane < full_planes; ++plane) {
	      const float* plane_in =
	          in.data() +
	          shape.LinearIndex({batch, 0, 0, plane * kPhwc4ChannelsInPlane});
	      float* plane_out =
	          batch_out + plane * pixel_count * kPhwc4ChannelsInPlane;
	      for (int px = 0; px < pixel_count; ++px) {
	        std::memcpy(plane_out + px * kPhwc4ChannelsInPlane,
	                    plane_in + px * shape.c,
	                    kPhwc4ChannelsInPlane * sizeof(float));
	      }
	    }
	  }

	  if (tail_channels == 0) {
	    return OkStatus();
	  }

	  // Pass 2: the last, partial plane. Copy the channels that exist and zero
	  // the rest of each 4-channel slot.
	  for (int batch = 0; batch < shape.b; ++batch) {
	    const float* tail_in =
	        in.data() +
	        shape.LinearIndex({batch, 0, 0, full_planes * kPhwc4ChannelsInPlane});
	    float* tail_out = out.data() + batch * batch_stride +
	                      pixel_count * full_planes * kPhwc4ChannelsInPlane;
	    for (int px = 0; px < pixel_count; ++px) {
	      float* slot = tail_out + px * kPhwc4ChannelsInPlane;
	      std::memcpy(slot, tail_in + px * shape.c,
	                  tail_channels * sizeof(float));
	      std::memset(slot + tail_channels, 0,
	                  (kPhwc4ChannelsInPlane - tail_channels) * sizeof(float));
	    }
	  }
	  return OkStatus();
	}

	// Converts a BHWC float tensor into the Pc,H,W,C4 layout, storing each value
	// as fp16 bits produced by fp16_ieee_from_fp32_value.
	//
	// Buffer sizes are validated first. Every batch is written as a sequence of
	// planes of kPhwc4ChannelsInPlane channels per pixel; the last plane is
	// padded with 0 when shape.c is not a multiple of 4.
	Status ConvertToPHWC4Half(absl::Span<const float> in, const BHWC& shape,
	                          absl::Span<HalfBits> out) {
	  RETURN_IF_ERROR(ValidateConvertToPHWC4(in, shape, out));

	  const int plane_count = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
	  const int pixel_count = shape.h * shape.w;
	  // Planes in which all kPhwc4ChannelsInPlane channels exist in the source.
	  const int full_planes = shape.c / kPhwc4ChannelsInPlane;
	  const int batch_stride = pixel_count * plane_count * kPhwc4ChannelsInPlane;
	  const int tail_channels = shape.c - full_planes * kPhwc4ChannelsInPlane;

	  // Pass 1: convert every full plane, four channels per pixel.
	  for (int batch = 0; batch < shape.b; ++batch) {
	    HalfBits* batch_out = out.data() + batch * batch_stride;
	    for (int plane = 0; plane < full_planes; ++plane) {
	      const float* plane_in =
	          in.data() +
	          shape.LinearIndex({batch, 0, 0, plane * kPhwc4ChannelsInPlane});
	      HalfBits* plane_out =
	          batch_out + plane * pixel_count * kPhwc4ChannelsInPlane;
	      for (int px = 0; px < pixel_count; ++px) {
	        const float* texel_in = plane_in + px * shape.c;
	        HalfBits* texel_out = plane_out + px * kPhwc4ChannelsInPlane;
	        for (int lane = 0; lane < kPhwc4ChannelsInPlane; ++lane) {
	          texel_out[lane] = fp16_ieee_from_fp32_value(texel_in[lane]);
	        }
	      }
	    }
	  }

	  if (tail_channels == 0) {
	    return OkStatus();
	  }

	  // Pass 2: the last, partial plane. Convert the channels that exist and
	  // write 0 into the rest of each 4-channel slot.
	  for (int batch = 0; batch < shape.b; ++batch) {
	    // Only 1, 2 or 3 leftover channels are handled; anything else is
	    // rejected once a batch is actually processed.
	    if (tail_channels < 1 || tail_channels > 3) {
	      return UnimplementedError(
	          "ConvertToPHWC4Half: Unsupported channels per planes count.");
	    }
	    const float* tail_in =
	        in.data() +
	        shape.LinearIndex({batch, 0, 0, full_planes * kPhwc4ChannelsInPlane});
	    HalfBits* tail_out = out.data() + batch * batch_stride +
	                         pixel_count * full_planes * kPhwc4ChannelsInPlane;
	    for (int px = 0; px < pixel_count; ++px) {
	      const float* texel_in = tail_in + px * shape.c;
	      HalfBits* texel_out = tail_out + px * kPhwc4ChannelsInPlane;
	      for (int lane = 0; lane < kPhwc4ChannelsInPlane; ++lane) {
	        if (lane < tail_channels) {
	          texel_out[lane] = fp16_ieee_from_fp32_value(texel_in[lane]);
	        } else {
	          texel_out[lane] = 0;
	        }
	      }
	    }
	  }
	  return OkStatus();
	}

	std::vector<float> ConvertToPHWC4(
	const Tensor<BHWC, DataType::FLOAT32>& tensor) {
	std::vector<float> transposed(GetElementsSizeForPHWC4(tensor.shape));
	ConvertToPHWC4(tensor.data, tensor.shape,
	absl::MakeSpan(transposed.data(), transposed.size()))
	.IgnoreError();
	// TODO(akulik): Maybe safer to return Status.
	return transposed;
	}

	// Convenience overload for an unbatched HWC float tensor. The tensor is
	// treated as a BHWC tensor with a batch of 1 and converted into a freshly
	// allocated PHWC4 buffer.
	//
	// Any error reported by the span-based ConvertToPHWC4 overload is discarded,
	// so the caller always receives a vector of the padded size.
	std::vector<float> ConvertToPHWC4(
	    const Tensor<HWC, DataType::FLOAT32>& tensor) {
	  const BHWC batched_shape(1, tensor.shape.h, tensor.shape.w, tensor.shape.c);
	  std::vector<float> phwc4(GetElementsSizeForPHWC4(batched_shape));
	  // TODO(akulik): Maybe safer to return Status.
	  ConvertToPHWC4(tensor.data, batched_shape, absl::MakeSpan(phwc4))
	      .IgnoreError();
	  return phwc4;
	}

	// Returns the number of elements a PHWC4 buffer needs for `shape`: the
	// channel count is rounded up via AlignByN to kPhwc4ChannelsInPlane and
	// multiplied by batch * height * width.
	uint32_t GetElementsSizeForPHWC4(const BHWC& shape) {
	  const auto padded_channels = AlignByN(shape.c, kPhwc4ChannelsInPlane);
	  const auto pixels_in_all_batches = shape.b * shape.h * shape.w;
	  return pixels_in_all_batches * padded_channels;
	}

	// Checks that the buffers handed to a PHWC4 -> BHWC conversion have the sizes
	// implied by `shape`.
	//
	//   in:  must hold exactly GetElementsSizeForPHWC4(shape) elements.
	//   out: must hold exactly shape.DimensionsProduct() elements.
	//
	// Returns InvalidArgumentError describing the first mismatch found, or
	// OkStatus() when both sizes match.
	template <typename T>
	Status ValidateConvertFromPHWC4(absl::Span<const T> in, const BHWC& shape,
	                                absl::Span<float> out) {
	  const auto expected_in_size = GetElementsSizeForPHWC4(shape);
	  if (in.size() != expected_in_size) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertFromPHWC4: Input data size does not match expected size: ",
	        in.size(), " != ", expected_in_size));
	  }

	  const auto expected_out_size = shape.DimensionsProduct();
	  if (out.size() != expected_out_size) {
	    return InvalidArgumentError(absl::StrCat(
	        "ConvertFromPHWC4: Output data size does not match expected size: ",
	        out.size(), " != ", expected_out_size));
	  }

	  return OkStatus();
	}

	// Unpacks a float PHWC4 buffer into a densely packed BHWC buffer.
	//
	//   in:    PHWC4 data; its size is checked by ValidateConvertFromPHWC4.
	//   shape: logical BHWC shape of the tensor.
	//   out:   destination holding shape.DimensionsProduct() floats.
	//
	// Returns the validation error if the buffer sizes do not match `shape`,
	// otherwise OkStatus().
	Status ConvertFromPHWC4(absl::Span<const float> in, const BHWC& shape,
	                        absl::Span<float> out) {
	  RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));

	  // With exactly one full plane there is no padding and both layouts have the
	  // same size, so the data is copied over in a single memcpy.
	  if (shape.c == kPhwc4ChannelsInPlane) {
	    std::memcpy(out.data(), in.data(),
	                shape.DimensionsProduct() * sizeof(float));
	    return OkStatus();
	  }

	  const int pixels_per_image = shape.h * shape.w;
	  const int plane_count =
	      IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
	  // Number of elements one batch entry occupies in the padded input.
	  const int batch_stride =
	      pixels_per_image * plane_count * kPhwc4ChannelsInPlane;
	  // Planes in which all kPhwc4ChannelsInPlane channels carry real data.
	  const int full_plane_count = shape.c / kPhwc4ChannelsInPlane;
	  // Number of elements one plane occupies in the padded input.
	  const int plane_stride = pixels_per_image * kPhwc4ChannelsInPlane;

	  // Full planes: every pixel contributes a complete group of channels.
	  for (int batch = 0; batch < shape.b; ++batch) {
	    const float* batch_src = in.data() + batch * batch_stride;
	    for (int plane = 0; plane < full_plane_count; ++plane) {
	      const float* plane_src = batch_src + plane * plane_stride;
	      float* plane_dst =
	          out.data() +
	          shape.LinearIndex({batch, 0, 0, plane * kPhwc4ChannelsInPlane});
	      for (int pixel = 0; pixel < pixels_per_image; ++pixel) {
	        std::memcpy(plane_dst + pixel * shape.c,
	                    plane_src + pixel * kPhwc4ChannelsInPlane,
	                    kPhwc4ChannelsInPlane * sizeof(float));
	      }
	    }
	  }

	  // Last, partially filled plane: copy only the real channels and skip the
	  // padding that follows them in the input.
	  const int tail_channels =
	      shape.c - full_plane_count * kPhwc4ChannelsInPlane;
	  if (tail_channels != 0) {
	    const int tail_offset =
	        pixels_per_image * full_plane_count * kPhwc4ChannelsInPlane;
	    for (int batch = 0; batch < shape.b; ++batch) {
	      const float* tail_src = in.data() + batch * batch_stride + tail_offset;
	      float* tail_dst =
	          out.data() +
	          shape.LinearIndex(
	              {batch, 0, 0, full_plane_count * kPhwc4ChannelsInPlane});
	      for (int pixel = 0; pixel < pixels_per_image; ++pixel) {
	        std::memcpy(tail_dst + pixel * shape.c,
	                    tail_src + pixel * kPhwc4ChannelsInPlane,
	                    tail_channels * sizeof(float));
	      }
	    }
	  }
	  return OkStatus();
	}

	// Unpacks a half-precision PHWC4 buffer into a densely packed float BHWC
	// buffer, converting each element with fp16_ieee_to_fp32_value.
	//
	//   in:    PHWC4 data stored as HalfBits; its size is checked by
	//          ValidateConvertFromPHWC4.
	//   shape: logical BHWC shape of the tensor.
	//   out:   destination holding shape.DimensionsProduct() floats.
	//
	// Returns the validation error if the buffer sizes do not match `shape`,
	// UnimplementedError if the number of channels left over after the full
	// planes is not 1, 2 or 3, otherwise OkStatus().
	Status ConvertFromPHWC4Half(absl::Span<const HalfBits> in, const BHWC& shape,
	                            absl::Span<float> out) {
	  RETURN_IF_ERROR(ValidateConvertFromPHWC4(in, shape, out));
	  const int num_planes = IntegralDivideRoundUp(shape.c, kPhwc4ChannelsInPlane);
	  const int num_pixels = shape.h * shape.w;
	  // Number of elements one batch entry occupies in the padded input.
	  const int padded_size = num_pixels * num_planes * kPhwc4ChannelsInPlane;
	  // A layer is a set of kPhwc4ChannelsInPlane channels images.
	  const int num_full_planes = shape.c / kPhwc4ChannelsInPlane;

	  // Full planes: every pixel contributes kPhwc4ChannelsInPlane channels.
	  for (int b = 0; b < shape.b; b++) {
	    const HalfBits* src = in.data() + b * padded_size;
	    for (int p = 0; p < num_full_planes; p++) {
	      float* dest =
	          out.data() + shape.LinearIndex({b, 0, 0, p * kPhwc4ChannelsInPlane});
	      for (int i = 0; i < num_pixels; i++) {
	        dest[0] = fp16_ieee_to_fp32_value(src[0]);
	        dest[1] = fp16_ieee_to_fp32_value(src[1]);
	        dest[2] = fp16_ieee_to_fp32_value(src[2]);
	        dest[3] = fp16_ieee_to_fp32_value(src[3]);
	        src += kPhwc4ChannelsInPlane;
	        dest += shape.c;
	      }
	    }
	  }

	  // Unpadding last kPhwc4ChannelsInPlane-channel plane
	  const int remaining_channels =
	      shape.c - num_full_planes * kPhwc4ChannelsInPlane;
	  if (remaining_channels == 0) {
	    return OkStatus();
	  }
	  for (int b = 0; b < shape.b; b++) {
	    const HalfBits* src = in.data() + b * padded_size +
	                          num_pixels * num_full_planes * kPhwc4ChannelsInPlane;
	    float* dest =
	        out.data() +
	        shape.LinearIndex({b, 0, 0, num_full_planes * kPhwc4ChannelsInPlane});
	    // Only the real channels are converted; the padding elements that
	    // complete each input group are skipped.
	    switch (remaining_channels) {
	      case 1:
	        for (int p = 0; p < num_pixels; p++) {
	          dest[0] = fp16_ieee_to_fp32_value(src[0]);
	          src += kPhwc4ChannelsInPlane;
	          dest += shape.c;
	        }
	        break;
	      case 2:
	        for (int p = 0; p < num_pixels; p++) {
	          dest[0] = fp16_ieee_to_fp32_value(src[0]);
	          dest[1] = fp16_ieee_to_fp32_value(src[1]);
	          src += kPhwc4ChannelsInPlane;
	          dest += shape.c;
	        }
	        break;
	      case 3:
	        for (int p = 0; p < num_pixels; p++) {
	          dest[0] = fp16_ieee_to_fp32_value(src[0]);
	          dest[1] = fp16_ieee_to_fp32_value(src[1]);
	          dest[2] = fp16_ieee_to_fp32_value(src[2]);
	          src += kPhwc4ChannelsInPlane;
	          dest += shape.c;
	        }
	        break;
	      default:
	        // The message names this function so the failure can be traced to
	        // the right conversion direction.
	        return UnimplementedError(
	            "ConvertFromPHWC4Half: Unsupported channels per planes count.");
	    }
	  }
	  return OkStatus();
	}

	} // namespace gpu
	} // namespace tflite