/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_
#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
namespace tflite {
namespace reference_ops {
template <typename T>
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const T* input1_data,
const RuntimeShape& input2_shape, const T* input2_data,
const RuntimeShape& output_shape, T* output_data) {
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < flat_size; ++i) {
output_data[i] = ActivationFunctionWithMinMax(
input1_data[i] + input2_data[i], params.quantized_activation_min,
params.quantized_activation_max);
}
}
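
// Illustrative sketch only, not part of the library: one way to call the
// templated Add above on int32 data. The shape, values, and activation
// bounds are made-up example values; the wide bounds make the clamp a no-op.
inline void ExampleAddInt32() {
  const RuntimeShape shape({1, 1, 1, 4});
  const int32 input1[4] = {1, 2, 3, 4};
  const int32 input2[4] = {10, 20, 30, 40};
  int32 output[4];
  ArithmeticParams params;
  params.quantized_activation_min = -(1 << 30);
  params.quantized_activation_max = 1 << 30;
  Add(params, shape, input1, shape, input2, shape, output);
  // output now holds {11, 22, 33, 44}.
}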
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const float* input1_data,
const RuntimeShape& input2_shape, const float* input2_data,
const RuntimeShape& output_shape, float* output_data) {
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
  for (int i = 0; i < flat_size; ++i) {
    const float x = input1_data[i] + input2_data[i];
output_data[i] = ActivationFunctionWithMinMax(
x, params.float_activation_min, params.float_activation_max);
}
}
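
// Illustrative sketch only, not part of the library: the float overload with
// the activation bounds of a fused ReLU6, i.e. outputs clamped to [0, 6].
// Shapes and values are made-up examples.
inline void ExampleAddFloatRelu6() {
  const RuntimeShape shape({1, 1, 1, 3});
  const float input1[3] = {-2.0f, 1.0f, 5.0f};
  const float input2[3] = {1.0f, 2.0f, 5.0f};
  float output[3];
  ArithmeticParams params;
  params.float_activation_min = 0.0f;  // ReLU6 lower bound.
  params.float_activation_max = 6.0f;  // ReLU6 upper bound.
  Add(params, shape, input1, shape, input2, shape, output);
  // Raw sums are {-1, 3, 10}; after clamping, output is {0, 3, 6}.
}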
// Element-wise add that can often be used for the inner loop of broadcast add
// as well as for the non-broadcast add.
inline void AddElementwise(int size, const ArithmeticParams& params,
const uint8* input1_data, const uint8* input2_data,
uint8* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
for (int i = 0; i < size; ++i) {
const int32 input1_val = params.input1_offset + input1_data[i];
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32 scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8>(clamped_output);
}
}
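
// The loop in AddElementwise above computes, in fixed point, the real-valued
// sum real_out = real_in1 + real_in2, where real_x = scale_x * (q_x - zero_x).
// In the prepared parameters (typically set up in the kernel's Prepare stage),
// input1_offset and input2_offset are the negated input zero points,
// output_offset is the output zero point, and left_shift (typically 20)
// provides headroom so the rescaled inputs can be summed without overflow.
// The input multiplier/shift pairs then encode
// scale_x / (2 * max(scale_1, scale_2)), and the output pair encodes
// 2 * max(scale_1, scale_2) / (2^left_shift * scale_out).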
// Scalar-broadcast add that can be used for the inner loop of a more general
// broadcast add, so that, for example, scalar broadcast with batch will still
// be fast.
inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
uint8 input1_data, const uint8* input2_data,
uint8* output_data) {
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
const int32 input1_val = params.input1_offset + input1_data;
const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
const int32 scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier, params.input1_shift);
for (int i = 0; i < size; ++i) {
const int32 input2_val = params.input2_offset + input2_data[i];
const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
const int32 scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[i] = static_cast<uint8>(clamped_output);
}
}
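
// For example, with input1 a single quantized value and input2 a run of
// `size` values, AddScalarBroadcast above matches AddElementwise with a
// constant first operand, but rescales that operand only once, outside the
// loop. BroadcastAddFivefold below uses it this way when the innermost
// non-broadcast run has length 1.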
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const uint8* input1_data,
const RuntimeShape& input2_shape, const uint8* input2_data,
const RuntimeShape& output_shape, uint8* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
TFLITE_DCHECK_GT(params.input1_offset, -256);
TFLITE_DCHECK_GT(params.input2_offset, -256);
TFLITE_DCHECK_LT(params.input1_offset, 256);
TFLITE_DCHECK_LT(params.input2_offset, 256);
AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
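
// Illustrative sketch only, not part of the library: hand-prepared params for
// the simple case where both inputs and the output share scale 1.0 and zero
// point 128. Real code derives the multiplier/shift pairs with
// QuantizeMultiplierSmallerThanOneExp (quantization_util.h); the constants
// below were worked out by hand for this one case.
inline void ExampleAddUint8() {
  ArithmeticParams params;
  params.left_shift = 20;
  params.input1_offset = -128;  // Negated input1 zero point.
  params.input2_offset = -128;  // Negated input2 zero point.
  params.output_offset = 128;   // Output zero point.
  // real_input_multiplier = scale / (2 * max_input_scale) = 0.5.
  params.input1_multiplier = 1 << 30;
  params.input1_shift = 0;
  params.input2_multiplier = 1 << 30;
  params.input2_shift = 0;
  // real_output_multiplier = 2 * max_input_scale / (2^20 * output_scale).
  params.output_multiplier = 1 << 30;
  params.output_shift = -18;
  params.quantized_activation_min = 0;
  params.quantized_activation_max = 255;
  const RuntimeShape shape({1, 1, 1, 2});
  const uint8 input1[2] = {130, 128};  // Real values {2, 0}.
  const uint8 input2[2] = {140, 128};  // Real values {12, 0}.
  uint8 output[2];
  Add(params, shape, input1, shape, input2, shape, output);
  // output is {142, 128}, i.e. real values {14, 0}.
}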
inline void Add(const ArithmeticParams& params,
const RuntimeShape& input1_shape, const int16* input1_data,
const RuntimeShape& input2_shape, const int16* input2_data,
const RuntimeShape& output_shape, int16* output_data) {
TFLITE_DCHECK_LE(params.quantized_activation_min,
params.quantized_activation_max);
const int input1_shift = params.input1_shift;
const int flat_size =
MatchingElementsSize(input1_shape, input2_shape, output_shape);
const int16 output_activation_min = params.quantized_activation_min;
const int16 output_activation_max = params.quantized_activation_max;
TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
TFLITE_DCHECK_LE(input1_shift, 0);
TFLITE_DCHECK_LE(params.input2_shift, 0);
const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
const int input_right_shift =
input1_shift == 0 ? -params.input2_shift : -input1_shift;
  for (int i = 0; i < flat_size; ++i) {
// F0 uses 0 integer bits, range [-1, 1].
using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
F0 scaled_input = F0::FromRaw(
gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
const int16 raw_output = result.raw();
const int16 clamped_output = std::min(
output_activation_max, std::max(output_activation_min, raw_output));
output_data[i] = clamped_output;
}
}
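
// Illustrative sketch only, not part of the library: both inputs in Q0.15
// fixed point, where raw 16384 represents 0.5. With input2_shift = -1,
// input2 is halved before the saturating add.
inline void ExampleAddInt16() {
  const RuntimeShape shape({1, 1, 1, 1});
  const int16 input1[1] = {16384};  // 0.5 in Q0.15.
  const int16 input2[1] = {16384};  // 0.5, scaled down to 0.25 by the shift.
  int16 output[1];
  ArithmeticParams params;
  params.input1_shift = 0;
  params.input2_shift = -1;
  params.quantized_activation_min = -32768;
  params.quantized_activation_max = 32767;
  Add(params, shape, input1, shape, input2, shape, output);
  // output[0] is 24576, i.e. 0.75 in Q0.15.
}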
// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const float* input1_data,
const RuntimeShape& input2_shape,
const float* input2_data,
const RuntimeShape& output_shape,
float* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
output_data[Offset(extended_output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
params.float_activation_min, params.float_activation_max);
}
}
}
}
}
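
// Illustrative sketch only, not part of the library: broadcasting a
// per-channel bias of shape [1, 1, 1, 3] across a [1, 1, 2, 3] input.
inline void ExampleBroadcastAddFloat() {
  const RuntimeShape input1_shape({1, 1, 2, 3});
  const float input1[6] = {1, 2, 3, 4, 5, 6};
  const RuntimeShape input2_shape({1, 1, 1, 3});
  const float input2[3] = {10, 20, 30};
  const RuntimeShape output_shape({1, 1, 2, 3});
  float output[6];
  ArithmeticParams params;
  params.float_activation_min = -1e30f;  // Wide bounds: the clamp is a no-op.
  params.float_activation_max = 1e30f;
  BroadcastAdd4DSlow(params, input1_shape, input1, input2_shape, input2,
                     output_shape, output);
  // output is {11, 22, 33, 14, 25, 36}.
}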
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const int32* input1_data,
const RuntimeShape& input2_shape,
const int32* input2_data,
const RuntimeShape& output_shape,
int32* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
output_data[Offset(extended_output_shape, b, y, x, c)] =
ActivationFunctionWithMinMax(
input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
input2_data[SubscriptToIndex(desc2, b, y, x, c)],
params.quantized_activation_min,
params.quantized_activation_max);
}
}
}
}
}
inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
const RuntimeShape& input1_shape,
const uint8* input1_data,
const RuntimeShape& input2_shape,
const uint8* input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
&desc2);
const RuntimeShape extended_output_shape =
RuntimeShape::ExtendedShape(4, output_shape);
  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
const int32 input1_val =
params.input1_offset +
input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32 input2_val =
params.input2_offset +
input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32 shifted_input1_val =
input1_val * (1 << params.left_shift);
const int32 shifted_input2_val =
input2_val * (1 << params.left_shift);
const int32 scaled_input1_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input1_val, params.input1_multiplier,
params.input1_shift);
const int32 scaled_input2_val =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
shifted_input2_val, params.input2_multiplier,
params.input2_shift);
const int32 raw_sum = scaled_input1_val + scaled_input2_val;
const int32 raw_output =
MultiplyByQuantizedMultiplierSmallerThanOneExp(
raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32 clamped_output =
std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
output_data[Offset(extended_output_shape, b, y, x, c)] =
static_cast<uint8>(clamped_output);
}
}
}
}
}
inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
const RuntimeShape& unswitched_input1_shape,
const uint8* unswitched_input1_data,
const RuntimeShape& unswitched_input2_shape,
const uint8* unswitched_input2_data,
const RuntimeShape& output_shape,
uint8* output_data) {
ArithmeticParams switched_params = unswitched_params;
switched_params.input1_offset = unswitched_params.input2_offset;
switched_params.input1_multiplier = unswitched_params.input2_multiplier;
switched_params.input1_shift = unswitched_params.input2_shift;
switched_params.input2_offset = unswitched_params.input1_offset;
switched_params.input2_multiplier = unswitched_params.input1_multiplier;
switched_params.input2_shift = unswitched_params.input1_shift;
const bool use_unswitched =
unswitched_params.broadcast_category ==
tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
const ArithmeticParams& params =
use_unswitched ? unswitched_params : switched_params;
const uint8* input1_data =
use_unswitched ? unswitched_input1_data : unswitched_input2_data;
const uint8* input2_data =
use_unswitched ? unswitched_input2_data : unswitched_input1_data;
  // Fivefold nested loops. The second input resets its position for each
  // iteration of the second loop. The first input is held in place through
  // the fourth loop, which broadcasts it, and advances afterwards. The
  // innermost loop is an elementwise add of sections of the arrays.
uint8* output_data_ptr = output_data;
const uint8* input1_data_ptr = input1_data;
const uint8* input2_data_reset = input2_data;
  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so are
  // shared between the input shapes. y3 is always broadcast for input 1, so
  // its dimension there is 1, while y1 may optionally be broadcast for
  // input 2. Put another way,
  //   input1.shape.FlatSize = y0 * y1 * y2 * y4,
  //   input2.shape.FlatSize = y0 * y2 * y3 * y4.
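  // For example, adding a [2, 8] tensor to a [1, 8] tensor (the second input
  // broadcast across the batch) admits the decomposition y0 = 1, y1 = 2,
  // y2 = 1, y3 = 1, y4 = 8: input1.shape.FlatSize = 1 * 2 * 1 * 8 = 16,
  // input2.shape.FlatSize = 1 * 1 * 1 * 8 = 8, and each batch of the first
  // input is added to the same 8-element run of the second.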
int y0 = params.broadcast_shape[0];
int y1 = params.broadcast_shape[1];
int y2 = params.broadcast_shape[2];
int y3 = params.broadcast_shape[3];
int y4 = params.broadcast_shape[4];
if (y4 > 1) {
// General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
// dimension.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
for (int i3 = 0; i3 < y3; ++i3) {
AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
output_data_ptr);
input2_data_ptr += y4;
output_data_ptr += y4;
}
// We have broadcast y4 of input1 data y3 times, and now move on.
input1_data_ptr += y4;
}
}
// We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
input2_data_reset = input2_data_ptr;
}
} else {
// Special case of y4 == 1, in which the innermost loop is a single element
// and can be combined with the next (y3) as an inner broadcast.
//
    // Note that this handles the case of pure scalar broadcast when
    // y0 == y1 == y2 == 1. With low overhead it also handles cases such as
    // scalar broadcast with batch (where y2 > 1).
    //
    // NOTE: The process is the same as the general case above, except that it
    // is simplified for y4 == 1 and the loop over y3 is contained within the
    // AddScalarBroadcast function.
for (int i0 = 0; i0 < y0; ++i0) {
const uint8* input2_data_ptr;
for (int i1 = 0; i1 < y1; ++i1) {
input2_data_ptr = input2_data_reset;
for (int i2 = 0; i2 < y2; ++i2) {
AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
output_data_ptr);
input2_data_ptr += y3;
output_data_ptr += y3;
input1_data_ptr += 1;
}
}
input2_data_reset = input2_data_ptr;
}
}
}
} // namespace reference_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_