| /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_ |
| #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_ |
| |
| #include "fixedpoint/fixedpoint.h" |
| #include "tensorflow/lite/kernels/internal/common.h" |
| |
| namespace tflite { |
| |
| namespace reference_ops { |
| |
| template <typename T> |
| inline void Add(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const T* input1_data, |
| const RuntimeShape& input2_shape, const T* input2_data, |
| const RuntimeShape& output_shape, T* output_data) { |
| const int flat_size = |
| MatchingElementsSize(input1_shape, input2_shape, output_shape); |
| for (int i = 0; i < flat_size; ++i) { |
| output_data[i] = ActivationFunctionWithMinMax( |
| input1_data[i] + input2_data[i], params.quantized_activation_min, |
| params.quantized_activation_max); |
| } |
| } |
| |
| inline void Add(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const float* input1_data, |
| const RuntimeShape& input2_shape, const float* input2_data, |
| const RuntimeShape& output_shape, float* output_data) { |
| const int flat_size = |
| MatchingElementsSize(input1_shape, input2_shape, output_shape); |
  for (int i = 0; i < flat_size; ++i) {
    const float x = input1_data[i] + input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(
        x, params.float_activation_min, params.float_activation_max);
| } |
| } |
| |
// Element-wise add that can often be used for the inner loop of a broadcast
// add as well as for the non-broadcast add.
| inline void AddElementwise(int size, const ArithmeticParams& params, |
| const uint8* input1_data, const uint8* input2_data, |
| uint8* output_data) { |
| TFLITE_DCHECK_GT(params.input1_offset, -256); |
| TFLITE_DCHECK_GT(params.input2_offset, -256); |
| TFLITE_DCHECK_LT(params.input1_offset, 256); |
| TFLITE_DCHECK_LT(params.input2_offset, 256); |
| |
| for (int i = 0; i < size; ++i) { |
| const int32 input1_val = params.input1_offset + input1_data[i]; |
| const int32 input2_val = params.input2_offset + input2_data[i]; |
| const int32 shifted_input1_val = input1_val * (1 << params.left_shift); |
| const int32 shifted_input2_val = input2_val * (1 << params.left_shift); |
| const int32 scaled_input1_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input1_val, params.input1_multiplier, params.input1_shift); |
| const int32 scaled_input2_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input2_val, params.input2_multiplier, params.input2_shift); |
| const int32 raw_sum = scaled_input1_val + scaled_input2_val; |
| const int32 raw_output = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| raw_sum, params.output_multiplier, params.output_shift) + |
| params.output_offset; |
| const int32 clamped_output = |
| std::min(params.quantized_activation_max, |
| std::max(params.quantized_activation_min, raw_output)); |
| output_data[i] = static_cast<uint8>(clamped_output); |
| } |
| } |
| |
// Scalar-broadcast add that can be used for the inner loop of a more general
// broadcast add, so that, for example, a scalar broadcast with batch will
// still be fast.
| inline void AddScalarBroadcast(int size, const ArithmeticParams& params, |
| uint8 input1_data, const uint8* input2_data, |
| uint8* output_data) { |
| TFLITE_DCHECK_GT(params.input1_offset, -256); |
| TFLITE_DCHECK_GT(params.input2_offset, -256); |
| TFLITE_DCHECK_LT(params.input1_offset, 256); |
| TFLITE_DCHECK_LT(params.input2_offset, 256); |
| |
| const int32 input1_val = params.input1_offset + input1_data; |
| const int32 shifted_input1_val = input1_val * (1 << params.left_shift); |
| const int32 scaled_input1_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input1_val, params.input1_multiplier, params.input1_shift); |
| for (int i = 0; i < size; ++i) { |
| const int32 input2_val = params.input2_offset + input2_data[i]; |
| const int32 shifted_input2_val = input2_val * (1 << params.left_shift); |
| const int32 scaled_input2_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input2_val, params.input2_multiplier, params.input2_shift); |
| const int32 raw_sum = scaled_input1_val + scaled_input2_val; |
| const int32 raw_output = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| raw_sum, params.output_multiplier, params.output_shift) + |
| params.output_offset; |
| const int32 clamped_output = |
| std::min(params.quantized_activation_max, |
| std::max(params.quantized_activation_min, raw_output)); |
| output_data[i] = static_cast<uint8>(clamped_output); |
| } |
| } |
| |
| inline void Add(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const uint8* input1_data, |
| const RuntimeShape& input2_shape, const uint8* input2_data, |
| const RuntimeShape& output_shape, uint8* output_data) { |
| TFLITE_DCHECK_LE(params.quantized_activation_min, |
| params.quantized_activation_max); |
| const int flat_size = |
| MatchingElementsSize(input1_shape, input2_shape, output_shape); |
| |
| TFLITE_DCHECK_GT(params.input1_offset, -256); |
| TFLITE_DCHECK_GT(params.input2_offset, -256); |
| TFLITE_DCHECK_LT(params.input1_offset, 256); |
| TFLITE_DCHECK_LT(params.input2_offset, 256); |
| AddElementwise(flat_size, params, input1_data, input2_data, output_data); |
| } |
| |
| inline void Add(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, const int16* input1_data, |
| const RuntimeShape& input2_shape, const int16* input2_data, |
| const RuntimeShape& output_shape, int16* output_data) { |
| TFLITE_DCHECK_LE(params.quantized_activation_min, |
| params.quantized_activation_max); |
| |
| const int input1_shift = params.input1_shift; |
| const int flat_size = |
| MatchingElementsSize(input1_shape, input2_shape, output_shape); |
| const int16 output_activation_min = params.quantized_activation_min; |
| const int16 output_activation_max = params.quantized_activation_max; |
| |
| TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0); |
| TFLITE_DCHECK_LE(input1_shift, 0); |
| TFLITE_DCHECK_LE(params.input2_shift, 0); |
| const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data; |
| const int16* shift_input = input1_shift == 0 ? input2_data : input1_data; |
| const int input_right_shift = |
| input1_shift == 0 ? -params.input2_shift : -input1_shift; |
| |
  for (int i = 0; i < flat_size; ++i) {
| // F0 uses 0 integer bits, range [-1, 1]. |
| using F0 = gemmlowp::FixedPoint<std::int16_t, 0>; |
| |
| F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]); |
| F0 scaled_input = F0::FromRaw( |
| gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift)); |
| F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled); |
| const int16 raw_output = result.raw(); |
| const int16 clamped_output = std::min( |
| output_activation_max, std::max(output_activation_min, raw_output)); |
| output_data[i] = clamped_output; |
| } |
| } |
| |
| // TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary |
| // dimensionality if the runtime code does a single loop over one dimension |
| // that handles broadcasting as the base case. The code generator would then |
| // generate max(D1, D2) nested for loops. |
| // TODO(benoitjacob): BroadcastAdd is intentionally duplicated from |
| // reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> |
| // is no longer referenced in this file, move NdArrayDesc<T> from types.h to |
| // reference_ops.h. |
| inline void BroadcastAdd4DSlow(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const float* input1_data, |
| const RuntimeShape& input2_shape, |
| const float* input2_data, |
| const RuntimeShape& output_shape, |
| float* output_data) { |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, |
| &desc2); |
| const RuntimeShape extended_output_shape = |
| RuntimeShape::ExtendedShape(4, output_shape); |
| |
  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), and with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
| for (int b = 0; b < extended_output_shape.Dims(0); ++b) { |
| for (int y = 0; y < extended_output_shape.Dims(1); ++y) { |
| for (int x = 0; x < extended_output_shape.Dims(2); ++x) { |
| for (int c = 0; c < extended_output_shape.Dims(3); ++c) { |
| output_data[Offset(extended_output_shape, b, y, x, c)] = |
| ActivationFunctionWithMinMax( |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)] + |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)], |
| params.float_activation_min, params.float_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
| inline void BroadcastAdd4DSlow(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const int32* input1_data, |
| const RuntimeShape& input2_shape, |
| const int32* input2_data, |
| const RuntimeShape& output_shape, |
| int32* output_data) { |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, |
| &desc2); |
| const RuntimeShape extended_output_shape = |
| RuntimeShape::ExtendedShape(4, output_shape); |
| |
  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), and with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
| for (int b = 0; b < extended_output_shape.Dims(0); ++b) { |
| for (int y = 0; y < extended_output_shape.Dims(1); ++y) { |
| for (int x = 0; x < extended_output_shape.Dims(2); ++x) { |
| for (int c = 0; c < extended_output_shape.Dims(3); ++c) { |
| output_data[Offset(extended_output_shape, b, y, x, c)] = |
| ActivationFunctionWithMinMax( |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)] + |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)], |
| params.quantized_activation_min, |
| params.quantized_activation_max); |
| } |
| } |
| } |
| } |
| } |
| |
| inline void BroadcastAdd4DSlow(const ArithmeticParams& params, |
| const RuntimeShape& input1_shape, |
| const uint8* input1_data, |
| const RuntimeShape& input2_shape, |
| const uint8* input2_data, |
| const RuntimeShape& output_shape, |
| uint8* output_data) { |
| NdArrayDesc<4> desc1; |
| NdArrayDesc<4> desc2; |
| NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, |
| &desc2); |
| const RuntimeShape extended_output_shape = |
| RuntimeShape::ExtendedShape(4, output_shape); |
| |
  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), and with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
| for (int b = 0; b < extended_output_shape.Dims(0); ++b) { |
| for (int y = 0; y < extended_output_shape.Dims(1); ++y) { |
| for (int x = 0; x < extended_output_shape.Dims(2); ++x) { |
| for (int c = 0; c < extended_output_shape.Dims(3); ++c) { |
| const int32 input1_val = |
| params.input1_offset + |
| input1_data[SubscriptToIndex(desc1, b, y, x, c)]; |
| const int32 input2_val = |
| params.input2_offset + |
| input2_data[SubscriptToIndex(desc2, b, y, x, c)]; |
| const int32 shifted_input1_val = |
| input1_val * (1 << params.left_shift); |
| const int32 shifted_input2_val = |
| input2_val * (1 << params.left_shift); |
| const int32 scaled_input1_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input1_val, params.input1_multiplier, |
| params.input1_shift); |
| const int32 scaled_input2_val = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| shifted_input2_val, params.input2_multiplier, |
| params.input2_shift); |
| const int32 raw_sum = scaled_input1_val + scaled_input2_val; |
| const int32 raw_output = |
| MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| raw_sum, params.output_multiplier, params.output_shift) + |
| params.output_offset; |
| const int32 clamped_output = |
| std::min(params.quantized_activation_max, |
| std::max(params.quantized_activation_min, raw_output)); |
| output_data[Offset(extended_output_shape, b, y, x, c)] = |
| static_cast<uint8>(clamped_output); |
| } |
| } |
| } |
| } |
| } |
| |
| inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, |
| const RuntimeShape& unswitched_input1_shape, |
| const uint8* unswitched_input1_data, |
| const RuntimeShape& unswitched_input2_shape, |
| const uint8* unswitched_input2_data, |
| const RuntimeShape& output_shape, |
| uint8* output_data) { |
| ArithmeticParams switched_params = unswitched_params; |
| switched_params.input1_offset = unswitched_params.input2_offset; |
| switched_params.input1_multiplier = unswitched_params.input2_multiplier; |
| switched_params.input1_shift = unswitched_params.input2_shift; |
| switched_params.input2_offset = unswitched_params.input1_offset; |
| switched_params.input2_multiplier = unswitched_params.input1_multiplier; |
| switched_params.input2_shift = unswitched_params.input1_shift; |
| |
| const bool use_unswitched = |
| unswitched_params.broadcast_category == |
| tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; |
| |
| const ArithmeticParams& params = |
| use_unswitched ? unswitched_params : switched_params; |
| const uint8* input1_data = |
| use_unswitched ? unswitched_input1_data : unswitched_input2_data; |
| const uint8* input2_data = |
| use_unswitched ? unswitched_input2_data : unswitched_input1_data; |
| |
| // Fivefold nested loops. The second input resets its position for each |
| // iteration of the second loop. The first input resets its position at the |
| // beginning of the fourth loop. The innermost loop is an elementwise add of |
| // sections of the arrays. |
| uint8* output_data_ptr = output_data; |
| const uint8* input1_data_ptr = input1_data; |
| const uint8* input2_data_reset = input2_data; |
  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so are
  // shared between the input shapes. y3 for input 1 is always broadcast, and
  // so the dimension there is 1, whereas optionally y1 might be broadcast for
  // input 2. Put another way,
  //   input1.shape.FlatSize = y0 * y1 * y2 * y4,
  //   input2.shape.FlatSize = y0 * y2 * y3 * y4.
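  // As an illustrative decomposition (shapes assumed for this example):
  // broadcasting input1 of shape [2, 1, 1, 8] against input2 of shape
  // [2, 4, 4, 8] collapses to broadcast_shape == {2, 1, 1, 16, 8}: y0 == 2
  // batches are shared, input1 is broadcast across y3 == 16 spatial
  // positions, and y4 == 8 channels are handled by AddElementwise.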
| int y0 = params.broadcast_shape[0]; |
| int y1 = params.broadcast_shape[1]; |
| int y2 = params.broadcast_shape[2]; |
| int y3 = params.broadcast_shape[3]; |
| int y4 = params.broadcast_shape[4]; |
| if (y4 > 1) { |
| // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner |
| // dimension. |
| for (int i0 = 0; i0 < y0; ++i0) { |
| const uint8* input2_data_ptr; |
| for (int i1 = 0; i1 < y1; ++i1) { |
| input2_data_ptr = input2_data_reset; |
| for (int i2 = 0; i2 < y2; ++i2) { |
| for (int i3 = 0; i3 < y3; ++i3) { |
| AddElementwise(y4, params, input1_data_ptr, input2_data_ptr, |
| output_data_ptr); |
| input2_data_ptr += y4; |
| output_data_ptr += y4; |
| } |
| // We have broadcast y4 of input1 data y3 times, and now move on. |
| input1_data_ptr += y4; |
| } |
| } |
| // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. |
| input2_data_reset = input2_data_ptr; |
| } |
| } else { |
| // Special case of y4 == 1, in which the innermost loop is a single element |
| // and can be combined with the next (y3) as an inner broadcast. |
| // |
| // Note that this handles the case of pure scalar broadcast when |
| // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar |
| // broadcast with batch (as y2 > 1). |
| // |
    // NOTE: The process is the same as the general case above, except that it
    // is simplified for y4 == 1 and the loop over y3 is contained within the
    // AddScalarBroadcast function.
| for (int i0 = 0; i0 < y0; ++i0) { |
| const uint8* input2_data_ptr; |
| for (int i1 = 0; i1 < y1; ++i1) { |
| input2_data_ptr = input2_data_reset; |
| for (int i2 = 0; i2 < y2; ++i2) { |
| AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, |
| output_data_ptr); |
| input2_data_ptr += y3; |
| output_data_ptr += y3; |
| input1_data_ptr += 1; |
| } |
| } |
| input2_data_reset = input2_data_ptr; |
| } |
| } |
| } |
| |
| } // namespace reference_ops |
| } // namespace tflite |
| |
| #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_ADD_H_ |