| /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_ |
| #define TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_ |
| |
| #ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK |
| #ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK |
| #define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK |
| #endif |
| #endif |
| |
| #include <functional> |
| |
| #include "fixedpoint/fixedpoint.h" |
| #include "tensorflow/lite/kernels/internal/cppmath.h" |
| #include "tensorflow/lite/kernels/internal/optimized/neon_check.h" |
| #include "tensorflow/lite/kernels/internal/types.h" |
| |
| namespace tflite { |
| |
| constexpr int kReverseShift = -1; |
| |
| inline void GetActivationMinMax(FusedActivationFunctionType ac, |
| float* output_activation_min, |
| float* output_activation_max) { |
| switch (ac) { |
| case FusedActivationFunctionType::kNone: |
| *output_activation_min = std::numeric_limits<float>::lowest(); |
| *output_activation_max = std::numeric_limits<float>::max(); |
| break; |
| case FusedActivationFunctionType::kRelu: |
| *output_activation_min = 0.f; |
| *output_activation_max = std::numeric_limits<float>::max(); |
| break; |
| case FusedActivationFunctionType::kRelu1: |
| *output_activation_min = -1.f; |
| *output_activation_max = 1.f; |
| break; |
| case FusedActivationFunctionType::kRelu6: |
| *output_activation_min = 0.f; |
| *output_activation_max = 6.f; |
| break; |
| } |
| } |
| |
| inline float ActivationFunctionWithMinMax(float x, float output_activation_min, |
| float output_activation_max) { |
| return std::min(std::max(x, output_activation_min), output_activation_max); |
| } |
| |
| // Legacy function, left for compatibility only. |
| template <FusedActivationFunctionType Ac> |
| float ActivationFunction(float x) { |
| float output_activation_min, output_activation_max; |
| GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); |
| return ActivationFunctionWithMinMax(x, output_activation_min, |
| output_activation_max); |
| } |
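| |
| // Illustrative usage (comment-only sketch, not part of the library API): |
| //   float act_min, act_max; |
| //   GetActivationMinMax(FusedActivationFunctionType::kRelu6, &act_min, &act_max); |
| //   // act_min == 0.f, act_max == 6.f |
| //   float y = ActivationFunctionWithMinMax(7.5f, act_min, act_max);  // y == 6.f |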
| |
| inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, |
| const float* bias_data, int array_size, |
| float* array_data) { |
| // Note: see b/132215220: in May 2019 we thought it would be OK to replace |
| // this with the Eigen one-liner: |
| // return (array.colwise() + bias).cwiseMax(clamp_min).cwiseMin(clamp_max). |
| // This turned out to severely regress performance: +4ms (i.e. 8%) on |
| // MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now. |
| TFLITE_DCHECK_EQ((array_size % bias_size), 0); |
| #ifdef USE_NEON |
| float* array_ptr = array_data; |
| float* array_end_ptr = array_ptr + array_size; |
| const auto clamp_min_vec = vdupq_n_f32(clamp_min); |
| const auto clamp_max_vec = vdupq_n_f32(clamp_max); |
| for (; array_ptr != array_end_ptr; array_ptr += bias_size) { |
| int i = 0; |
| for (; i <= bias_size - 16; i += 16) { |
| auto b0 = vld1q_f32(bias_data + i); |
| auto b1 = vld1q_f32(bias_data + i + 4); |
| auto b2 = vld1q_f32(bias_data + i + 8); |
| auto b3 = vld1q_f32(bias_data + i + 12); |
| auto a0 = vld1q_f32(array_ptr + i); |
| auto a1 = vld1q_f32(array_ptr + i + 4); |
| auto a2 = vld1q_f32(array_ptr + i + 8); |
| auto a3 = vld1q_f32(array_ptr + i + 12); |
| auto x0 = vaddq_f32(a0, b0); |
| auto x1 = vaddq_f32(a1, b1); |
| auto x2 = vaddq_f32(a2, b2); |
| auto x3 = vaddq_f32(a3, b3); |
| x0 = vmaxq_f32(clamp_min_vec, x0); |
| x1 = vmaxq_f32(clamp_min_vec, x1); |
| x2 = vmaxq_f32(clamp_min_vec, x2); |
| x3 = vmaxq_f32(clamp_min_vec, x3); |
| x0 = vminq_f32(clamp_max_vec, x0); |
| x1 = vminq_f32(clamp_max_vec, x1); |
| x2 = vminq_f32(clamp_max_vec, x2); |
| x3 = vminq_f32(clamp_max_vec, x3); |
| vst1q_f32(array_ptr + i, x0); |
| vst1q_f32(array_ptr + i + 4, x1); |
| vst1q_f32(array_ptr + i + 8, x2); |
| vst1q_f32(array_ptr + i + 12, x3); |
| } |
| for (; i <= bias_size - 4; i += 4) { |
| auto b = vld1q_f32(bias_data + i); |
| auto a = vld1q_f32(array_ptr + i); |
| auto x = vaddq_f32(a, b); |
| x = vmaxq_f32(clamp_min_vec, x); |
| x = vminq_f32(clamp_max_vec, x); |
| vst1q_f32(array_ptr + i, x); |
| } |
| for (; i < bias_size; i++) { |
| array_ptr[i] = ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], |
| clamp_min, clamp_max); |
| } |
| } |
| #else // not NEON |
| for (int array_offset = 0; array_offset < array_size; |
| array_offset += bias_size) { |
| for (int i = 0; i < bias_size; i++) { |
| array_data[array_offset + i] = ActivationFunctionWithMinMax( |
| array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); |
| } |
| } |
| #endif |
| } |
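| |
| // Illustrative usage of BiasAndClamp (comment-only sketch): add a 2-element |
| // bias across a 4-element array and clamp the result to [0, 6]: |
| //   float bias[2] = {1.f, -1.f}; |
| //   float data[4] = {0.5f, 2.f, 7.f, 0.f}; |
| //   BiasAndClamp(0.f, 6.f, 2, bias, 4, data); |
| //   // data is now {1.5f, 1.f, 6.f, 0.f} |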
| |
| inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp( |
| int32 x, int32 quantized_multiplier, int left_shift) { |
| using gemmlowp::RoundingDivideByPOT; |
| using gemmlowp::SaturatingRoundingDoublingHighMul; |
| return RoundingDivideByPOT( |
| SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); |
| } |
| |
| inline int32 MultiplyByQuantizedMultiplierGreaterThanOne( |
| int32 x, int32 quantized_multiplier, int left_shift) { |
| using gemmlowp::SaturatingRoundingDoublingHighMul; |
| return SaturatingRoundingDoublingHighMul(x * (1 << left_shift), |
| quantized_multiplier); |
| } |
| |
| inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier, |
| int shift) { |
| using gemmlowp::RoundingDivideByPOT; |
| using gemmlowp::SaturatingRoundingDoublingHighMul; |
| int left_shift = shift > 0 ? shift : 0; |
| int right_shift = shift > 0 ? 0 : -shift; |
| return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( |
| x * (1 << left_shift), quantized_multiplier), |
| right_shift); |
| } |
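| |
| // Worked example (comment-only): the pair (quantized_multiplier, shift) encodes |
| // a real multiplier as quantized_multiplier * 2^(shift - 31), with |
| // quantized_multiplier normally in [1 << 30, (1 << 31) - 1]. A real multiplier |
| // of 1.5 can be encoded as 0.75 * 2^1: |
| //   int32 y = MultiplyByQuantizedMultiplier(100, 1610612736 /* 0.75 * 2^31 */, 1); |
| //   // y == 150 |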
| |
| inline int32 MultiplyByQuantizedMultiplier(int64_t x, |
| int32 quantized_multiplier, |
| int shift) { |
| // Inputs: |
| // - quantized_multiplier has fixed point at bit 31 |
| // - shift is -31 to +7 (negative for right shift) |
| // |
| // Assumptions: The following input ranges are assumed |
| // - quantized_multiplier >= 0 (the usual range is (1 << 30) to (1 << 31) - 1) |
| // - scaling is chosen so final scaled result fits in int32 |
| // - input x is in the range -(1<<47) <= x < (1<<47) |
| assert(quantized_multiplier >= 0); |
| assert(shift >= -31 && shift < 8); |
| |
| int32_t reduced_multiplier = (quantized_multiplier + (1 << 15)) >> 16; |
| int total_shift = 15 - shift; |
| x = (x * (int64_t)reduced_multiplier) + ((int64_t)1 << (total_shift - 1)); |
| int32_t result = x >> total_shift; |
| return result; |
| } |
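| |
| // Worked example for this 64-bit overload (comment-only): with |
| // quantized_multiplier = 1 << 30 (a real multiplier of 0.5) and shift = 0, the |
| // multiplier is first rounded to 16 bits (reduced_multiplier = 1 << 14) and the |
| // product is then rounded and shifted right by 15: |
| //   int32 y = MultiplyByQuantizedMultiplier(int64_t{1} << 20, 1 << 30, 0); |
| //   // y == 1 << 19 |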
| |
| template <typename T> |
| int CountLeadingZeros(T integer_input) { |
| static_assert(std::is_unsigned<T>::value, |
| "Only unsigned integer types handled."); |
| #if defined(__GNUC__) |
| return integer_input ? __builtin_clz(integer_input) |
| : std::numeric_limits<T>::digits; |
| #else |
| if (integer_input == 0) { |
| return std::numeric_limits<T>::digits; |
| } |
| |
| const T one_in_leading_positive = static_cast<T>(1) |
| << (std::numeric_limits<T>::digits - 1); |
| int leading_zeros = 0; |
| while (integer_input < one_in_leading_positive) { |
| integer_input <<= 1; |
| ++leading_zeros; |
| } |
| return leading_zeros; |
| #endif |
| } |
| |
| template <typename T> |
| inline int CountLeadingSignBits(T integer_input) { |
| static_assert(std::is_signed<T>::value, "Only signed integer types handled."); |
| #if defined(__GNUC__) && !defined(__clang__) |
| return integer_input ? __builtin_clrsb(integer_input) |
| : std::numeric_limits<T>::digits; |
| #else |
| using U = typename std::make_unsigned<T>::type; |
| return integer_input >= 0 |
| ? CountLeadingZeros(static_cast<U>(integer_input)) - 1 |
| : integer_input != std::numeric_limits<T>::min() |
| ? CountLeadingZeros(2 * static_cast<U>(-integer_input) - 1) |
| : 0; |
| #endif |
| } |
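| |
| // Examples (comment-only): CountLeadingZeros<uint32_t>(1u) == 31, |
| // CountLeadingZeros<uint32_t>(0u) == 32, CountLeadingSignBits<int32_t>(3) == 29, |
| // CountLeadingSignBits<int32_t>(-1) == 31. |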
| |
| // Use "count leading zeros" helper functions to do a fast Floor(log_2(x)). |
| template <typename Integer> |
| inline Integer FloorLog2(Integer n) { |
| static_assert(std::is_integral<Integer>::value, ""); |
| static_assert(std::is_signed<Integer>::value, ""); |
| static_assert(sizeof(Integer) == 4 || sizeof(Integer) == 8, ""); |
| TFLITE_CHECK_GT(n, 0); |
| if (sizeof(Integer) == 4) { |
| return 30 - CountLeadingSignBits(n); |
| } else { |
| return 62 - CountLeadingSignBits(n); |
| } |
| } |
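| |
| // Examples (comment-only): FloorLog2(1) == 0, FloorLog2(5) == 2, |
| // FloorLog2(1 << 10) == 10. |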
| |
| // Generates an int16 LUT for the given function, e.g., the exp(x) and |
| // 1/(1+x) tables used in softmax. |
| inline void gen_lut(const std::function<double(double)>& func, double min, |
| double max, int16_t* table, const int num) { |
| // The table is expected to hold num entries: num - 1 base samples plus one |
| // final element that is used only for slope calculation. |
| double step = (max - min) / (num - 1); |
| double half_step = step / 2.0; |
| for (int i = 0; i < num - 1; i++) { |
| double sample_val = TfLiteRound(func(min + i * step) * 32768.0); |
| double midpoint_interp_val = |
| TfLiteRound((func(min + (i + 1) * step) * 32768.0 + |
| TfLiteRound(func(min + i * step) * 32768.0)) / |
| 2.0); |
| double midpoint_val = |
| TfLiteRound(func(min + i * step + half_step) * 32768.0); |
| double midpoint_err = midpoint_interp_val - midpoint_val; |
| double bias = TfLiteRound(midpoint_err / 2.0); |
| table[i] = std::min(std::max(sample_val - bias, -32768.0), 32767.0); |
| } |
| table[num - 1] = |
| std::min(std::max(TfLiteRound(func(max) * 32768.0), -32768.0), 32767.0); |
| } |
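| |
| // Illustrative usage (comment-only sketch; the table size and input range are |
| // assumptions chosen to match the 512-interval layout that |
| // generic_int16_table_lookup below expects): |
| //   int16_t exp_lut[513]; |
| //   gen_lut([](double x) { return std::exp(x); }, -10.0, 0.0, exp_lut, 513); |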
| |
| // int16 function table lookup, e.g., for the exp() and 1/(1+x) tables used in softmax |
| inline int16_t generic_int16_table_lookup(int16_t value, const int16_t* lut) { |
| // 512 base values; the 513th entry (lut[512]) is used only for slope calculation. |
| uint16_t index = static_cast<uint16_t>(256 + (value >> 7)); |
| assert(index < 512 && "LUT index out of range."); |
| int16_t offset = value & 0x7f; |
| |
| // base and slope are Q0.15 |
| int16_t base = lut[index]; |
| int16_t slope = lut[index + 1] - lut[index]; |
| |
| // Q0.15 * Q0.7 = Q0.22 |
| // Round and convert from Q0.22 to Q0.15 |
| int32_t delta = (static_cast<int32_t>(slope) * offset + 64) >> 7; |
| |
| // Q0.15 + Q0.15 |
| return base + delta; |
| } |
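| |
| // Example (comment-only): value == 0 maps to index 256 with offset 0, so the |
| // lookup returns lut[256] exactly; value == 64 returns lut[256] plus half of |
| // the slope (lut[257] - lut[256]), rounded. |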
| |
| // Table of sigmoid(i/24) in 0.16 fixed-point format - 256 elements. |
| |
| // We use a combined sigmoid and tanh look-up table, since |
| // tanh(x) = 2*sigmoid(2*x) - 1. |
| // Both functions are symmetric, so the LUT is only needed |
| // for the absolute value of the input. |
| static const uint16_t sigmoid_table_uint16[256] = { |
| 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, 38180, 38841, 39498, |
| 40149, 40794, 41432, 42064, 42688, 43304, 43912, 44511, 45102, 45683, 46255, |
| 46817, 47369, 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, 51865, |
| 52311, 52745, 53169, 53581, 53983, 54374, 54755, 55125, 55485, 55834, 56174, |
| 56503, 56823, 57133, 57433, 57724, 58007, 58280, 58544, 58800, 59048, 59288, |
| 59519, 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, 61279, 61441, |
| 61599, 61750, 61896, 62036, 62172, 62302, 62428, 62549, 62666, 62778, 62886, |
| 62990, 63090, 63186, 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835, |
| 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, 64357, 64405, 64450, |
| 64494, 64536, 64576, 64614, 64652, 64687, 64721, 64754, 64786, 64816, 64845, |
| 64873, 64900, 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, 65097, |
| 65115, 65132, 65149, 65164, 65179, 65194, 65208, 65221, 65234, 65246, 65258, |
| 65269, 65280, 65291, 65301, 65310, 65319, 65328, 65337, 65345, 65352, 65360, |
| 65367, 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, 65420, 65425, |
| 65429, 65433, 65438, 65442, 65445, 65449, 65453, 65456, 65459, 65462, 65465, |
| 65468, 65471, 65474, 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, |
| 65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, 65505, 65507, 65508, |
| 65509, 65510, 65511, 65512, 65513, 65514, 65515, 65516, 65517, 65517, 65518, |
| 65519, 65520, 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, 65525, |
| 65525, 65526, 65526, 65526, 65527, 65527, 65528, 65528, 65528, 65529, 65529, |
| 65529, 65529, 65530, 65530, 65530, 65530, 65531, 65531, 65531, 65531, 65531, |
| 65532, 65532, 65532, 65532, 65532, 65532, 65533, 65533, 65533, 65533, 65533, |
| 65533, 65533, 65533, 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65534, |
| 65534, 65534, 65535}; |
| |
| // TODO(b/77858996): Add these to gemmlowp. |
| template <typename IntegerType> |
| IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) { |
| static_assert(std::is_same<IntegerType, void>::value, "unimplemented"); |
| return a; |
| } |
| |
| template <> |
| inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) { |
| std::int64_t a64 = a; |
| std::int64_t b64 = b; |
| std::int64_t sum = a64 + b64; |
| return static_cast<std::int32_t>(std::min( |
| static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()), |
| std::max( |
| static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()), |
| sum))); |
| } |
| |
| template <typename tRawType, int tIntegerBits> |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp( |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> a, |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> b) { |
| return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( |
| SaturatingAddNonGemmlowp(a.raw(), b.raw())); |
| } |
| |
| template <typename IntegerType> |
| IntegerType SaturatingSub(IntegerType a, IntegerType b) { |
| static_assert(std::is_same<IntegerType, void>::value, "unimplemented"); |
| return a; |
| } |
| |
| template <> |
| inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) { |
| std::int32_t a32 = a; |
| std::int32_t b32 = b; |
| std::int32_t diff = a32 - b32; |
| return static_cast<std::int16_t>( |
| std::min(static_cast<int32_t>(32767), |
| std::max(static_cast<int32_t>(-32768), diff))); |
| } |
| |
| template <> |
| inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) { |
| std::int64_t a64 = a; |
| std::int64_t b64 = b; |
| std::int64_t diff = a64 - b64; |
| return static_cast<std::int32_t>(std::min( |
| static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()), |
| std::max( |
| static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()), |
| diff))); |
| } |
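| |
| // Examples (comment-only): |
| //   SaturatingAddNonGemmlowp(std::numeric_limits<std::int32_t>::max(), 1) |
| //       == std::numeric_limits<std::int32_t>::max(); |
| //   SaturatingSub(std::int16_t{-30000}, std::int16_t{10000}) == -32768. |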
| |
| template <typename tRawType, int tIntegerBits> |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub( |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> a, |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> b) { |
| return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( |
| SaturatingSub(a.raw(), b.raw())); |
| } |
| // End section to be moved to gemmlowp. |
| |
| template <typename IntegerType> |
| IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) { |
| if (exponent == 0) { |
| return x; |
| } |
| using ScalarIntegerType = |
| typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType; |
| const IntegerType min = |
| gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min()); |
| const IntegerType max = |
| gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max()); |
| const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType); |
| |
| const std::int32_t threshold = |
| ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1); |
| const IntegerType positive_mask = |
| gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold)); |
| const IntegerType negative_mask = |
| gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold)); |
| |
| IntegerType result = gemmlowp::ShiftLeft(x, exponent); |
| result = gemmlowp::SelectUsingMask(positive_mask, max, result); |
| result = gemmlowp::SelectUsingMask(negative_mask, min, result); |
| return result; |
| } |
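| |
| // Examples (comment-only, scalar int32 case): |
| //   SaturatingRoundingMultiplyByPOTParam(5, 2) == 20; |
| //   SaturatingRoundingMultiplyByPOTParam(1 << 29, 3) saturates to |
| //   std::numeric_limits<int32_t>::max(), since 1 << 29 exceeds the |
| //   threshold (1 << 28) - 1 for exponent 3. |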
| |
| // If we want to leave IntegerBits fixed, then multiplication |
| // by a power of two has to be saturating/rounding, not exact anymore. |
| template <typename tRawType, int tIntegerBits> |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> |
| SaturatingRoundingMultiplyByPOTParam( |
| gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) { |
| return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw( |
| SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent)); |
| } |
| |
| // Minimum output bits to accommodate log of maximum input range. It actually |
| // does not matter if one considers, say, [-64,64] or [-64,64). |
| // |
| // For example, run this through Octave: |
| // [0:127; ... |
| // ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ... |
| // ceil(log(abs( log(2.^(0:127))+1 ))/log(2))] |
| constexpr int min_log_x_output_bits(int input_bits) { |
| return input_bits > 90 |
| ? 7 |
| : input_bits > 44 |
| ? 6 |
| : input_bits > 21 |
| ? 5 |
| : input_bits > 10 |
| ? 4 |
| : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1; |
| } |
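| |
| // Example (comment-only): with 12 input integer bits the largest representable |
| // input is just under 2^12, whose natural log is about 8.3; representing that |
| // requires 4 integer bits, and min_log_x_output_bits(12) == 4. |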
| |
| // Although the name of this function says that it cannot handle values less |
| // than 1, in practice it can handle inputs as low as 1/x_max, where x_max is |
| // the largest representable input. In other words, the output range is |
| // symmetric. |
| template <int OutputIntegerBits, int InputIntegerBits> |
| inline gemmlowp::FixedPoint<int32, OutputIntegerBits> |
| log_x_for_x_greater_than_or_equal_to_1_impl( |
| gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) { |
| // assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1); |
| // assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits); |
| using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; |
| // The reason for accumulating the result with an extra bit of headroom is |
| // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled * |
| // recip_denom will otherwise introduce an error. |
| static constexpr int kAccumIntegerBits = OutputIntegerBits + 1; |
| using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>; |
| |
| const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( |
| FixedPoint0, 1488522236, std::log(2.0)); |
| const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( |
| FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5))); |
| const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( |
| FixedPoint0, 1518500250, std::sqrt(0.5)); |
| const FixedPoint0 one_quarter = |
| GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0); |
| |
| const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( |
| FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0))); |
| const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( |
| FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0))); |
| const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( |
| FixedPoint0, 1057819769, |
| 2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0))); |
| const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT( |
| FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0))); |
| |
| const FixedPointAccum shifted_quarter = |
| gemmlowp::Rescale<kAccumIntegerBits>(one_quarter); |
| |
| // Reinterpret the input value as Q0.31, because we will figure out the |
| // required shift "ourselves" instead of using, say, Rescale. |
| FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw()); |
| // z_a_pow_2 = input_integer_bits - z_a_headroom; |
| int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw())); |
| FixedPoint0 r_a_tmp = |
| SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1)); |
| const int32 r_a_raw = |
| SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1); |
| // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25); |
| // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25, |
| // InputIntegerBits - z_b_headroom - 0.25); |
| const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp( |
| FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam( |
| InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)), |
| shifted_quarter); |
| |
| // z_b is treated like z_a, but premultiplying by sqrt(0.5). |
| FixedPoint0 z_b = z_a * sqrt_half; |
| int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1; |
| const int32 r_b_raw = |
| SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom); |
| const FixedPointAccum z_b_pow_2_adj = SaturatingSub( |
| FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam( |
| InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)), |
| shifted_quarter); |
| |
| const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw)); |
| const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw( |
| std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw())); |
| |
| const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half); |
| FixedPoint0 q = r - sqrt_sqrt_half; |
| q = q + q; |
| |
| const FixedPoint0 common_sq = q * q; |
| const FixedPoint0 num = q * r + q * common_sq * alpha_n; |
| const FixedPoint0 denom_minus_one_0 = |
| p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q; |
| const FixedPoint0 recip_denom = |
| one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0); |
| |
| const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num); |
| return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 + |
| num_scaled * recip_denom); |
| } |
| |
| template <int OutputIntegerBits, int InputIntegerBits> |
| inline gemmlowp::FixedPoint<int32, OutputIntegerBits> |
| log_x_for_x_greater_than_or_equal_to_1( |
| gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) { |
| static_assert( |
| OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits), |
| "Output integer bits must be sufficient to accommodate logs of inputs."); |
| return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits, |
| InputIntegerBits>( |
| input_val); |
| } |
| |
| inline int32 GetReciprocal(int32 x, int x_integer_digits, |
| int* num_bits_over_unit) { |
| int headroom_plus_one = CountLeadingZeros(static_cast<uint32>(x)); |
| // This is the number of bits to the left of the binary point above 1.0. |
| // Consider x=1.25. In that case shifted_scale=0.8 and |
| // no later adjustment will be needed. |
| *num_bits_over_unit = x_integer_digits - headroom_plus_one; |
| const int32 shifted_sum_minus_one = |
| static_cast<int32>((static_cast<uint32>(x) << headroom_plus_one) - |
| (static_cast<uint32>(1) << 31)); |
| |
| gemmlowp::FixedPoint<int32, 0> shifted_scale = |
| gemmlowp::one_over_one_plus_x_for_x_in_0_1( |
| gemmlowp::FixedPoint<int32, 0>::FromRaw(shifted_sum_minus_one)); |
| return shifted_scale.raw(); |
| } |
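| |
| // Worked example (comment-only): if x encodes the real value 1.25, then after |
| // normalization the fractional part is 0.25, so the function returns roughly |
| // 0.8 * 2^31 (1 / 1.25 in Q0.31) and sets *num_bits_over_unit to 0. |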
| |
| inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift, |
| int32* output_inv_sqrt, |
| int* output_shift) { |
| TFLITE_DCHECK_GE(input, 0); |
| if (input <= 1) { |
| // Handle the input value 1 separately to avoid overflow in that case |
| // in the general computation below (b/143972021). Also handle 0 as if it |
| // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid |
| // but rare/unrealistic input value. We can expect both to occur in some |
| // incompletely trained models, but probably not in fully trained models. |
| *output_inv_sqrt = std::numeric_limits<std::int32_t>::max(); |
| *output_shift = 0; |
| return; |
| } |
| TFLITE_DCHECK_GT(input, 1); |
| *output_shift = 11; |
| while (input >= (1 << 29)) { |
| input /= 4; |
| ++*output_shift; |
| } |
| const unsigned max_left_shift_bits = |
| CountLeadingZeros(static_cast<uint32>(input)) - 1; |
| const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; |
| const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; |
| *output_shift -= left_shift_bit_pairs; |
| input <<= 2 * left_shift_bit_pairs; |
| TFLITE_DCHECK_GE(input, (1 << 27)); |
| TFLITE_DCHECK_LT(input, (1 << 29)); |
| using gemmlowp::FixedPoint; |
| using gemmlowp::Rescale; |
| using gemmlowp::SaturatingRoundingMultiplyByPOT; |
| // Using 3 integer bits gives us enough room for the internal arithmetic in |
| // this Newton-Raphson iteration. |
| using F3 = FixedPoint<int32, 3>; |
| using F0 = FixedPoint<int32, 0>; |
| const F3 fixedpoint_input = F3::FromRaw(input >> 1); |
| const F3 fixedpoint_half_input = |
| SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); |
| const F3 fixedpoint_half_three = |
| GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); |
| // Newton-Raphson iteration |
| // Naive unoptimized starting guess: x = 1 |
| F3 x = F3::One(); |
| // Naive unoptimized number of iterations: 5 |
| for (int i = 0; i < 5; i++) { |
| const F3 x3 = Rescale<3>(x * x * x); |
| x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); |
| } |
| const F0 fixedpoint_half_sqrt_2 = |
| GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); |
| x = x * fixedpoint_half_sqrt_2; |
| *output_inv_sqrt = x.raw(); |
| if (*output_shift < 0) { |
| *output_inv_sqrt <<= -*output_shift; |
| *output_shift = 0; |
| } |
| // Convert right shift (right is positive) to left shift. |
| *output_shift *= reverse_shift; |
| } |
| |
| // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING |
| // BROADCASTING. |
| // |
| // NdArrayDesc<N> describes the shape and memory layout of an N-dimensional |
| // rectangular array of numbers. |
| // |
| // NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. |
| // However, as Dims<N> is to be deprecated, this class exists as an adaptor |
| // to enable simple unoptimized implementations of element-wise broadcasting |
| // operations. |
| template <int N> |
| struct NdArrayDesc { |
| // The "extent" of each dimension. Indices along dimension d must be in the |
| // half-open interval [0, extents[d]). |
| int extents[N]; |
| |
| // The number of *elements* (not bytes) between consecutive indices of each |
| // dimension. |
| int strides[N]; |
| }; |
| |
| // DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING |
| // BROADCASTING. |
| // |
| // Same as Offset(), except it takes an NdArrayDesc<N> instead of a Dims<N>. |
| inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, |
| int i3) { |
| TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]); |
| TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]); |
| TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]); |
| TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]); |
| return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + |
| i3 * desc.strides[3]; |
| } |
| |
| inline int SubscriptToIndex(const NdArrayDesc<5>& desc, int indexes[5]) { |
| return indexes[0] * desc.strides[0] + indexes[1] * desc.strides[1] + |
| indexes[2] * desc.strides[2] + indexes[3] * desc.strides[3] + |
| indexes[4] * desc.strides[4]; |
| } |
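| |
| // Example (comment-only): for a 4-D descriptor with extents {2, 3, 4, 5} and |
| // row-major strides {60, 20, 5, 1} (as produced by CopyDimsToDesc below), |
| // SubscriptToIndex(desc, 1, 2, 3, 4) == 1*60 + 2*20 + 3*5 + 4 == 119. |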
| |
| // Given the dimensions of the operands for an element-wise binary broadcast, |
| // adjusts them so that they can be directly iterated over with simple loops. |
| // Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and |
| // 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. |
| // |
| // This function assumes that the two input shapes are compatible up to |
| // broadcasting and the shorter one has already been prepended with 1s to be the |
| // same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), |
| // shape1 must already have been prepended to be (1, 1, 1, 64). Recall that |
| // Dims<N> refer to shapes in reverse order. In this case, input0_dims will be |
| // (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). |
| // |
| // When two shapes are compatible up to broadcasting, for each dimension d, |
| // the input extents are either equal, or one of them is 1. |
| // |
| // This function performs the following for each dimension d: |
| // - If the extents are equal, then do nothing since the loop that walks over |
| // both of the input arrays is correct. |
| // - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 |
| // and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows |
| // array0 to be referenced *at any index* in dimension d and still access the |
| // same slice. |
| template <int N> |
| inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, |
| const Dims<N>& input1_dims, |
| NdArrayDesc<N>* desc0_out, |
| NdArrayDesc<N>* desc1_out) { |
| TFLITE_DCHECK(desc0_out != nullptr); |
| TFLITE_DCHECK(desc1_out != nullptr); |
| |
| // Copy dims to desc. |
| for (int i = 0; i < N; ++i) { |
| desc0_out->extents[i] = input0_dims.sizes[i]; |
| desc0_out->strides[i] = input0_dims.strides[i]; |
| desc1_out->extents[i] = input1_dims.sizes[i]; |
| desc1_out->strides[i] = input1_dims.strides[i]; |
| } |
| |
| // Walk over each dimension. If the extents are equal do nothing. |
| // Otherwise, set the desc with extent 1 to have extent equal to the other and |
| // stride 0. |
| for (int i = 0; i < N; ++i) { |
| const int extent0 = ArraySize(input0_dims, i); |
| const int extent1 = ArraySize(input1_dims, i); |
| if (extent0 != extent1) { |
| if (extent0 == 1) { |
| desc0_out->strides[i] = 0; |
| desc0_out->extents[i] = extent1; |
| } else { |
| TFLITE_DCHECK_EQ(extent1, 1); |
| desc1_out->strides[i] = 0; |
| desc1_out->extents[i] = extent0; |
| } |
| } |
| } |
| } |
| |
| // Copies dims to desc, calculating strides. |
| template <int N> |
| inline void CopyDimsToDesc(const RuntimeShape& input_shape, |
| NdArrayDesc<N>* desc_out) { |
| int desc_stride = 1; |
| for (int i = N - 1; i >= 0; --i) { |
| desc_out->extents[i] = input_shape.Dims(i); |
| desc_out->strides[i] = desc_stride; |
| desc_stride *= input_shape.Dims(i); |
| } |
| } |
| |
| template <int N> |
| inline void NdArrayDescsForElementwiseBroadcast( |
| const RuntimeShape& input0_shape, const RuntimeShape& input1_shape, |
| NdArrayDesc<N>* desc0_out, NdArrayDesc<N>* desc1_out) { |
| TFLITE_DCHECK(desc0_out != nullptr); |
| TFLITE_DCHECK(desc1_out != nullptr); |
| |
| auto extended_input0_shape = RuntimeShape::ExtendedShape(N, input0_shape); |
| auto extended_input1_shape = RuntimeShape::ExtendedShape(N, input1_shape); |
| |
| // Copy dims to desc, calculating strides. |
| CopyDimsToDesc<N>(extended_input0_shape, desc0_out); |
| CopyDimsToDesc<N>(extended_input1_shape, desc1_out); |
| |
| // Walk over each dimension. If the extents are equal do nothing. |
| // Otherwise, set the desc with extent 1 to have extent equal to the other and |
| // stride 0. |
| for (int i = 0; i < N; ++i) { |
| const int extent0 = extended_input0_shape.Dims(i); |
| const int extent1 = extended_input1_shape.Dims(i); |
| if (extent0 != extent1) { |
| if (extent0 == 1) { |
| desc0_out->strides[i] = 0; |
| desc0_out->extents[i] = extent1; |
| } else { |
| TFLITE_DCHECK_EQ(extent1, 1); |
| desc1_out->strides[i] = 0; |
| desc1_out->extents[i] = extent0; |
| } |
| } |
| } |
| } |
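| |
| // Example (comment-only): broadcasting RuntimeShape({2, 1, 4}) against |
| // RuntimeShape({1, 3, 1}) with N == 3 yields desc0 extents {2, 3, 4} with |
| // strides {4, 0, 1}, and desc1 extents {2, 3, 4} with strides {0, 1, 0}. |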
| |
| template <int N> |
| inline void NdArrayDescsForElementwiseBroadcast( |
| const RuntimeShape& input0_shape, const RuntimeShape& input1_shape, |
| const RuntimeShape& input2_shape, NdArrayDesc<N>* desc0_out, |
| NdArrayDesc<N>* desc1_out, NdArrayDesc<N>* desc2_out) { |
| TFLITE_DCHECK(desc0_out != nullptr); |
| TFLITE_DCHECK(desc1_out != nullptr); |
| TFLITE_DCHECK(desc2_out != nullptr); |
| |
| auto extended_input0_shape = RuntimeShape::ExtendedShape(N, input0_shape); |
| auto extended_input1_shape = RuntimeShape::ExtendedShape(N, input1_shape); |
| auto extended_input2_shape = RuntimeShape::ExtendedShape(N, input2_shape); |
| |
| // Copy dims to desc, calculating strides. |
| CopyDimsToDesc<N>(extended_input0_shape, desc0_out); |
| CopyDimsToDesc<N>(extended_input1_shape, desc1_out); |
| CopyDimsToDesc<N>(extended_input2_shape, desc2_out); |
| |
| // Walk over each dimension. If the extents are equal do nothing. |
| // Otherwise, set the desc with extent 1 to have extent equal to the other and |
| // stride 0. |
| for (int i = 0; i < N; ++i) { |
| const int extent0 = extended_input0_shape.Dims(i); |
| const int extent1 = extended_input1_shape.Dims(i); |
| const int extent2 = extended_input2_shape.Dims(i); |
| |
| int extent = extent0; |
| if (extent1 != 1) extent = extent1; |
| if (extent2 != 1) extent = extent2; |
| |
| TFLITE_DCHECK(extent0 == 1 || extent0 == extent); |
| TFLITE_DCHECK(extent1 == 1 || extent1 == extent); |
| TFLITE_DCHECK(extent2 == 1 || extent2 == extent); |
| |
| if (!(extent0 == extent1 && extent1 == extent2)) { |
| if (extent0 == 1) { |
| desc0_out->strides[i] = 0; |
| desc0_out->extents[i] = extent; |
| } |
| if (extent1 == 1) { |
| desc1_out->strides[i] = 0; |
| desc1_out->extents[i] = extent; |
| } |
| if (extent2 == 1) { |
| desc2_out->strides[i] = 0; |
| desc2_out->extents[i] = extent; |
| } |
| } |
| } |
| } |
| |
| // Detailed implementation of NDOpsHelper; the indexes argument must be a |
| // zero-initialized array. This implementation is equivalent to N nested loops. |
| // For example, if N=4, it can be rewritten as: |
| // for (int b = 0; b < output.extents[0]; ++b) { |
| // for (int y = 0; y < output.extents[1]; ++y) { |
| // for (int x = 0; x < output.extents[2]; ++x) { |
| // for (int c = 0; c < output.extents[3]; ++c) { |
| // calc({b,y,x,c}); |
| // } |
| // } |
| // } |
| // } |
| template <int N, int DIM, typename Calc> |
| typename std::enable_if<DIM != N - 1, void>::type NDOpsHelperImpl( |
| const NdArrayDesc<N>& output, const Calc& calc, int indexes[N]) { |
| for (indexes[DIM] = 0; indexes[DIM] < output.extents[DIM]; ++indexes[DIM]) { |
| NDOpsHelperImpl<N, DIM + 1, Calc>(output, calc, indexes); |
| } |
| } |
| |
| template <int N, int DIM, typename Calc> |
| typename std::enable_if<DIM == N - 1, void>::type NDOpsHelperImpl( |
| const NdArrayDesc<N>& output, const Calc& calc, int indexes[N]) { |
| for (indexes[DIM] = 0; indexes[DIM] < output.extents[DIM]; ++indexes[DIM]) { |
| calc(indexes); |
| } |
| } |
| |
| // Execute the calc function in the innermost iteration based on the shape of |
| // the output. The calc function should take a single argument of type int[N]. |
| template <int N, typename Calc> |
| inline void NDOpsHelper(const NdArrayDesc<N>& output, const Calc& calc) { |
| int indexes[N] = {0}; |
| NDOpsHelperImpl<N, 0, Calc>(output, calc, indexes); |
| } |
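| |
| // Illustrative usage (comment-only sketch; output_desc and output_data are |
| // placeholder names): fill every element of a 4-D output with zero: |
| //   NDOpsHelper<4>(output_desc, [&](int indexes[4]) { |
| //     output_data[SubscriptToIndex(output_desc, indexes[0], indexes[1], |
| //                                  indexes[2], indexes[3])] = 0.f; |
| //   }); |
| |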
| // Copied from gemmlowp::RoundDown when we dropped direct dependency on |
| // gemmlowp. |
| // |
| // Returns the runtime argument rounded down to the nearest multiple of |
| // the fixed Modulus. |
| template <unsigned Modulus, typename Integer> |
| Integer RoundDown(Integer i) { |
| return i - (i % Modulus); |
| } |
| |
| // Copied from gemmlowp::RoundUp when we dropped direct dependency on |
| // gemmlowp. |
| // |
| // Returns the runtime argument rounded up to the nearest multiple of |
| // the fixed Modulus. |
| template <unsigned Modulus, typename Integer> |
| Integer RoundUp(Integer i) { |
| return RoundDown<Modulus>(i + Modulus - 1); |
| } |
| |
| // Copied from gemmlowp::CeilQuotient when we dropped direct dependency on |
| // gemmlowp. |
| // |
| // Returns the quotient a / b rounded up ('ceil') to the nearest integer. |
| template <typename Integer> |
| Integer CeilQuotient(Integer a, Integer b) { |
| return (a + b - 1) / b; |
| } |
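| |
| // Examples (comment-only): RoundDown<8>(13) == 8, RoundUp<8>(13) == 16, |
| // RoundUp<8>(16) == 16, CeilQuotient(13, 4) == 4. |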
| |
| // This function is a copy of gemmlowp::HowManyThreads, copied when we dropped |
| // the direct dependency of internal/optimized/ on gemmlowp. |
| // |
| // It computes a reasonable number of threads to use for a GEMM of shape |
| // (rows, cols, depth). |
| // |
| // TODO(b/131910176): get rid of this function by switching each call site |
| // to its own more sensible logic for its own workload. |
| template <int KernelRows> |
| inline int LegacyHowManyThreads(int max_num_threads, int rows, int cols, |
| int depth) { |
| // Early-exit in the default case where multi-threading is disabled. |
| if (max_num_threads == 1) { |
| return 1; |
| } |
| |
| // Ensure that each thread has KernelRows rows to process, if at all possible. |
| int thread_count = std::min(max_num_threads, rows / KernelRows); |
| |
| // Limit the number of threads according to the overall size of the problem. |
| if (thread_count > 1) { |
| // Empirically determined value. |
| static constexpr std::uint64_t min_cubic_size_per_thread = 64 * 1024; |
| |
| // In 32 bits we could only multiply two of the three sizes without risking |
| // overflow, so compute the product in 64 bits. |
| const std::uint64_t cubic_size = |
| std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth); |
| |
| thread_count = std::min( |
| thread_count, static_cast<int>(cubic_size / min_cubic_size_per_thread)); |
| } |
| |
| if (thread_count < 1) { |
| thread_count = 1; |
| } |
| |
| assert(thread_count > 0 && thread_count <= max_num_threads); |
| return thread_count; |
| } |
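| |
| // Worked example (comment-only): with KernelRows == 4, max_num_threads == 4, |
| // rows == 64, cols == 64 and depth == 64, the per-row cap gives |
| // min(4, 64 / 4) == 4 and the cubic-size cap gives |
| // min(4, 64 * 64 * 64 / (64 * 1024)) == 4, so 4 threads are used. |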
| |
| template <typename T> |
| void optimized_ops_preload_l1_stream(const T* ptr) { |
| #ifdef __GNUC__ |
| // builtin offered by GCC-compatible compilers including clang |
| __builtin_prefetch(ptr, /* 0 means read */ 0, /* 0 means no locality */ 0); |
| #else |
| (void)ptr; |
| #endif |
| } |
| |
| template <typename T> |
| void optimized_ops_preload_l1_keep(const T* ptr) { |
| #ifdef __GNUC__ |
| // builtin offered by GCC-compatible compilers including clang |
| __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3); |
| #else |
| (void)ptr; |
| #endif |
| } |
| |
| template <typename T> |
| void optimized_ops_prefetch_write_l1_keep(const T* ptr) { |
| #ifdef __GNUC__ |
| // builtin offered by GCC-compatible compilers including clang |
| __builtin_prefetch(ptr, /* 1 means write */ 1, /* 3 means high locality */ 3); |
| #else |
| (void)ptr; |
| #endif |
| } |
| |
| } // namespace tflite |
| |
| #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_ |