/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

| inline void FullyConnected( |
| const FullyConnectedParams& params, const RuntimeShape& input_shape, |
| const float* input_data, const RuntimeShape& weights_shape, |
| const float* weights_data, const RuntimeShape& bias_shape, |
| const float* bias_data, const RuntimeShape& output_shape, |
| float* output_data) { |
| const float output_activation_min = params.float_activation_min; |
| const float output_activation_max = params.float_activation_max; |
| // TODO(benoitjacob): This really should be: |
| // const int batches = ArraySize(output_dims, 1); |
| // but the current --variable_batch hack consists in overwriting the 3rd |
| // dimension with the runtime batch size, as we don't keep track for each |
| // array of which dimension is the batch dimension in it. |
| const int output_dims_count = output_shape.DimensionsCount(); |
| const int weights_dims_count = weights_shape.DimensionsCount(); |
| const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); |
| const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2, |
| output_shape, output_dims_count - 1); |
| const int accum_depth = weights_shape.Dims(weights_dims_count - 1); |
| for (int b = 0; b < batches; ++b) { |
| for (int out_c = 0; out_c < output_depth; ++out_c) { |
| float total = 0.f; |
| for (int d = 0; d < accum_depth; ++d) { |
| total += input_data[b * accum_depth + d] * |
| weights_data[out_c * accum_depth + d]; |
| } |
| float bias_value = 0.0f; |
| if (bias_data) { |
| bias_value = bias_data[out_c]; |
| } |
| output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax( |
| total + bias_value, output_activation_min, output_activation_max); |
| } |
| } |
| } |
| |
| inline void FullyConnected( |
| const FullyConnectedParams& params, const RuntimeShape& input_shape, |
| const uint8* input_data, const RuntimeShape& filter_shape, |
| const uint8* filter_data, const RuntimeShape& bias_shape, |
| const int32* bias_data, const RuntimeShape& output_shape, |
| uint8* output_data) { |
| const int32 input_offset = params.input_offset; |
| const int32 filter_offset = params.weights_offset; |
| const int32 output_offset = params.output_offset; |
| const int32 output_multiplier = params.output_multiplier; |
| const int output_shift = params.output_shift; |
| const int32 output_activation_min = params.quantized_activation_min; |
| const int32 output_activation_max = params.quantized_activation_max; |
| TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2); |
| TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1); |
| |
| TFLITE_DCHECK_LE(output_activation_min, output_activation_max); |
| // TODO(benoitjacob): This really should be: |
| // const int batches = ArraySize(output_dims, 1); |
| // but the current --variable_batch hack consists in overwriting the 3rd |
| // dimension with the runtime batch size, as we don't keep track for each |
| // array of which dimension is the batch dimension in it. |
| const int output_dim_count = output_shape.DimensionsCount(); |
| const int filter_dim_count = filter_shape.DimensionsCount(); |
| const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); |
| const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2, |
| output_shape, output_dim_count - 1); |
| const int accum_depth = filter_shape.Dims(filter_dim_count - 1); |
| for (int b = 0; b < batches; ++b) { |
| for (int out_c = 0; out_c < output_depth; ++out_c) { |
| int32 acc = 0; |
| for (int d = 0; d < accum_depth; ++d) { |
| int32 input_val = input_data[b * accum_depth + d]; |
| int32 filter_val = filter_data[out_c * accum_depth + d]; |
| acc += (filter_val + filter_offset) * (input_val + input_offset); |
| } |
| if (bias_data) { |
| acc += bias_data[out_c]; |
| } |
| acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); |
| acc += output_offset; |
| acc = std::max(acc, output_activation_min); |
| acc = std::min(acc, output_activation_max); |
| output_data[out_c + output_depth * b] = static_cast<uint8>(acc); |
| } |
| } |
| } |
| |
| inline void FullyConnected( |
| const FullyConnectedParams& params, const RuntimeShape& input_shape, |
| const uint8* input_data, const RuntimeShape& filter_shape, |
| const uint8* filter_data, const RuntimeShape& bias_shape, |
| const int32* bias_data, const RuntimeShape& output_shape, |
| int16* output_data) { |
| const int32 input_offset = params.input_offset; |
| const int32 filter_offset = params.weights_offset; |
| const int32 output_offset = params.output_offset; |
| const int32 output_multiplier = params.output_multiplier; |
| const int output_shift = params.output_shift; |
| const int32 output_activation_min = params.quantized_activation_min; |
| const int32 output_activation_max = params.quantized_activation_max; |
| |
| TFLITE_DCHECK_LE(output_activation_min, output_activation_max); |
| TFLITE_DCHECK_EQ(output_offset, 0); |
| // TODO(benoitjacob): This really should be: |
| // const int batches = ArraySize(output_dims, 1); |
| // but the current --variable_batch hack consists in overwriting the 3rd |
| // dimension with the runtime batch size, as we don't keep track for each |
| // array of which dimension is the batch dimension in it. |
| const int output_dim_count = output_shape.DimensionsCount(); |
| const int filter_dim_count = filter_shape.DimensionsCount(); |
| const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); |
| const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2, |
| output_shape, output_dim_count - 1); |
| const int accum_depth = filter_shape.Dims(filter_dim_count - 1); |
| for (int b = 0; b < batches; ++b) { |
| for (int out_c = 0; out_c < output_depth; ++out_c) { |
| // Internal accumulation. |
| // Initialize accumulator with the bias-value. |
| int32 accum = bias_data[out_c]; |
| // Accumulation loop. |
| for (int d = 0; d < accum_depth; ++d) { |
| int16 input_val = input_data[b * accum_depth + d] + input_offset; |
| int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset; |
| accum += filter_val * input_val; |
| } |
| // Down-scale the final int32 accumulator to the scale used by our |
| // (16-bit, typically 3 integer bits) fixed-point format. The quantized |
| // multiplier and shift here have been pre-computed offline |
| // (e.g. by toco). |
| accum = |
| MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift); |
| // Saturate, cast to int16, and store to output array. |
| accum = std::max(accum, output_activation_min - output_offset); |
| accum = std::min(accum, output_activation_max - output_offset); |
| accum += output_offset; |
| output_data[out_c + output_depth * b] = accum; |
| } |
| } |
| } |
| |
// Fully-connected kernel for weights that were pre-"shuffled" offline into
// 4x16 blocks and had their sign bit flipped (XOR 0x80), so the inner loops
// can reinterpret the uint8 storage as int8 and get the zero-point-128
// subtraction for free. Only batch sizes 1 and 4 are supported (DCHECK'd
// below). Output is int16 fixed-point, as in the int16-output FullyConnected
// overload. `shuffled_input_workspace_data` is scratch space that must hold
// at least batches * accum_depth bytes.
inline void ShuffledFullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& weights_shape,
    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int16* output_data, uint8* shuffled_input_workspace_data) {
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);

  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int weights_dim_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
  // The shuffled weight layout is built from 4x16 blocks, so both the depth
  // and the output-channel count must be multiples of the block dimensions.
  TFLITE_DCHECK((accum_depth % 16) == 0);
  TFLITE_DCHECK((output_depth % 4) == 0);

  // Shuffling and xoring of input activations into the workspace buffer.
  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
  if (batches == 1) {
    // Single batch: no interleaving needed, just flip the sign bit.
    for (int i = 0; i < accum_depth; i++) {
      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
    }
  } else if (batches == 4) {
    // Batch of 4: interleave the batches in 16-wide depth blocks so the
    // kernel below can read a 4x16 input tile contiguously.
    for (int c = 0; c < accum_depth; c += 16) {
      for (int b = 0; b < 4; b++) {
        const uint8* src_data_ptr = input_data + b * accum_depth + c;
        for (int j = 0; j < 16; j++) {
          uint8 src_val = *src_data_ptr++;
          // Flip the sign bit, so that the kernel will only need to
          // reinterpret these uint8 values as int8, getting for free the
          // subtraction of the zero_point value 128.
          uint8 dst_val = src_val ^ 0x80;
          *shuffled_input_workspace_ptr++ = dst_val;
        }
      }
    }
  } else {
    // Only batch sizes 1 and 4 are supported by this kernel.
    TFLITE_DCHECK(false);
    return;
  }

  // Actual computation.
  if (batches == 1) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      // Internal accumulation.
      // Accumulators start at zero; the bias is added after the loop.
      int32 accum[4] = {0};
      // Accumulation loop: walk the depth in 16-wide blocks, with 4 output
      // channels per block (matching the shuffled weight layout).
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int j = 0; j < 16; j++) {
            int8 input_val = shuffled_input_data[d + j];
            int8 weights_val = *shuffled_weights_ptr++;
            accum[i] += weights_val * input_val;
          }
        }
      }
      for (int i = 0; i < 4; i++) {
        // Add bias value
        int32 acc = accum[i] + bias_data[c + i];
        // Down-scale the final int32 accumulator to the scale used by our
        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
        // multiplier and shift here have been pre-computed offline
        // (e.g. by toco).
        acc =
            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
        // Saturate, cast to int16, and store to output array.
        acc = std::max(acc, output_activation_min);
        acc = std::min(acc, output_activation_max);
        output_ptr[c + i] = acc;
      }
    }
  } else if (batches == 4) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      const int8* shuffled_input_ptr = shuffled_input_data;
      // Accumulation loop.
      // Internal accumulation: accum[i][b] is output channel (c + i) for
      // batch b. Accumulators start at zero; bias is added after the loop.
      int32 accum[4][4];
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          accum[i][b] = 0;
        }
      }
      // Each depth step consumes one 4x16 weights tile and one 4x16 input
      // tile (64 bytes each, hence the pointer advances below).
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int b = 0; b < 4; b++) {
            for (int j = 0; j < 16; j++) {
              int8 input_val = shuffled_input_ptr[16 * b + j];
              int8 weights_val = shuffled_weights_ptr[16 * i + j];
              accum[i][b] += weights_val * input_val;
            }
          }
        }
        shuffled_input_ptr += 64;
        shuffled_weights_ptr += 64;
      }
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          // Add bias value
          int32 acc = accum[i][b] + bias_data[c + i];
          // Down-scale the final int32 accumulator to the scale used by our
          // (16-bit, typically 3 integer bits) fixed-point format. The
          // quantized multiplier and shift here have been pre-computed offline
          // (e.g. by toco).
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
                                              output_shift);
          // Saturate, cast to int16, and store to output array.
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_ptr[b * output_depth + c + i] = acc;
        }
      }
    }
  } else {
    // Unreachable: batch sizes other than 1 and 4 already returned above.
    TFLITE_DCHECK(false);
    return;
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_