coral / imx-gst-plugins-good / a1204282ee7796984877b557e9bc6cbcfd178d7c / . / gst / monoscope / convolve.c

/* Karatsuba convolution | |

* | |

* Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz> | |

* | |

* This library is free software; you can redistribute it and/or | |

* modify it under the terms of the GNU Library General Public | |

* License as published by the Free Software Foundation; either | |

* version 2 of the License, or (at your option) any later version. | |

* | |

* This library is distributed in the hope that it will be useful, | |

* but WITHOUT ANY WARRANTY; without even the implied warranty of | |

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |

* Library General Public License for more details. | |

* | |

* You should have received a copy of the GNU Library General Public | |

* License along with this library; if not, write to the | |

* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, | |

* Boston, MA 02110-1301, USA. | |

* | |

* | |

* Note: 7th December 2004: This file used to be licensed under the GPL, | |

* but we got permission from Ralp Loader to relicense it to LGPL. | |

* | |

* $Id$ | |

* | |

*/ | |

/* The algorithm is based on the following. For the convolution of a pair | |

* of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four | |

* multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c - | |

* b.d. A similar relation enables us to compute a 2n by 2n convolution | |

* using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n | |

* multiplications (as opposed to the 4^n that the quadratic algorithm | |

* takes. */ | |

/* For large n, this is slower than the O(n log n) that the FFT method | |

* takes, but we avoid using complex numbers, and we only have to compute | |

* one convolution, as opposed to 3 FFTs. We have good locality-of- | |

* reference as well, which will help on CPUs with tiny caches. */ | |

/* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160 | |

* (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba | |

* algorithm. We actually want 257 outputs of a 256 x 512 convolution; | |

* that doesn't appear to give an easy advantage for the FFT algorithm, but | |

* for the Karatsuba algorithm, it's easy to use two 256 x 256 | |

* convolutions, taking 2 x 3^8 = 12312 multiplications. [This difference | |

* is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n, | |

* while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n | |

* - 1]. */ | |

/* There's a big lie above, actually... for a 4x4 convolution, it's quicker | |

* to do it using 16 multiplications than the more complex Karatsuba | |

* algorithm... So the recursion bottoms out at 4x4s. This increases the | |

* number of multiplications by a factor of 16/9, but reduces the overheads | |

* dramatically. */ | |

/* The convolution algorithm is implemented as a stack machine. We have a | |

* stack of commands, each in one of the forms "do a 2^n x 2^n | |

* convolution", or "combine these three length 2^n outputs into one | |

* 2^{n+1} output." */ | |

#ifdef HAVE_CONFIG_H | |

#include "config.h" | |

#endif | |

#include <stdlib.h> | |

#include "convolve.h" | |

typedef union stack_entry_s | |

{ | |

struct | |

{ | |

const double *left, *right; | |

double *out; | |

} | |

v; | |

struct | |

{ | |

double *main, *null; | |

} | |

b; | |

} | |

stack_entry; | |

struct _struct_convolve_state | |

{ | |

int depth, small, big, stack_size; | |

double *left; | |

double *right; | |

double *scratch; | |

stack_entry *stack; | |

}; | |

/* | |

* Initialisation routine - sets up tables and space to work in. | |

* Returns a pointer to internal state, to be used when performing calls. | |

* On error, returns NULL. | |

* The pointer should be freed when it is finished with, by convolve_close(). | |

*/ | |

convolve_state * | |

convolve_init (int depth) | |

{ | |

convolve_state *state; | |

state = malloc (sizeof (convolve_state)); | |

state->depth = depth; | |

state->small = (1 << depth); | |

state->big = (2 << depth); | |

state->stack_size = depth * 3; | |

state->left = calloc (state->big, sizeof (double)); | |

state->right = calloc (state->small * 3, sizeof (double)); | |

state->scratch = calloc (state->small * 3, sizeof (double)); | |

state->stack = calloc (state->stack_size + 1, sizeof (stack_entry)); | |

return state; | |

} | |

/* | |

* Free the state allocated with convolve_init(). | |

*/ | |

void | |

convolve_close (convolve_state * state) | |

{ | |

free (state->left); | |

free (state->right); | |

free (state->scratch); | |

free (state->stack); | |

free (state); | |

} | |

static void | |

convolve_4 (double *out, const double *left, const double *right) | |

/* This does a 4x4 -> 7 convolution. For what it's worth, the slightly odd | |

* ordering gives about a 1% speed up on my Pentium II. */ | |

{ | |

double l0, l1, l2, l3, r0, r1, r2, r3; | |

double a; | |

l0 = left[0]; | |

r0 = right[0]; | |

a = l0 * r0; | |

l1 = left[1]; | |

r1 = right[1]; | |

out[0] = a; | |

a = (l0 * r1) + (l1 * r0); | |

l2 = left[2]; | |

r2 = right[2]; | |

out[1] = a; | |

a = (l0 * r2) + (l1 * r1) + (l2 * r0); | |

l3 = left[3]; | |

r3 = right[3]; | |

out[2] = a; | |

out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0); | |

out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1); | |

out[5] = (l2 * r3) + (l3 * r2); | |

out[6] = l3 * r3; | |

} | |

static void | |

convolve_run (stack_entry * top, unsigned size, double *scratch) | |

/* Interpret a stack of commands. The stack starts with two entries; the | |

* convolution to do, and an illegal entry used to mark the stack top. The | |

* size is the number of entries in each input, and must be a power of 2, | |

* and at least 8. It is OK to have out equal to left and/or right. | |

* scratch must have length 3*size. The number of stack entries needed is | |

* 3n-4 where size=2^n. */ | |

{ | |

do { | |

const double *left; | |

const double *right; | |

double *out; | |

/* When we get here, the stack top is always a convolve, | |

* with size > 4. So we will split it. We repeatedly split | |

* the top entry until we get to size = 4. */ | |

left = top->v.left; | |

right = top->v.right; | |

out = top->v.out; | |

top++; | |

do { | |

double *s_left, *s_right; | |

int i; | |

/* Halve the size. */ | |

size >>= 1; | |

/* Allocate the scratch areas. */ | |

s_left = scratch + size * 3; | |

/* s_right is a length 2*size buffer also used for | |

* intermediate output. */ | |

s_right = scratch + size * 4; | |

/* Create the intermediate factors. */ | |

for (i = 0; i < size; i++) { | |

double l = left[i] + left[i + size]; | |

double r = right[i] + right[i + size]; | |

s_left[i + size] = r; | |

s_left[i] = l; | |

} | |

/* Push the combine entry onto the stack. */ | |

top -= 3; | |

top[2].b.main = out; | |

top[2].b.null = NULL; | |

/* Push the low entry onto the stack. This must be | |

* the last of the three sub-convolutions, because | |

* it may overwrite the arguments. */ | |

top[1].v.left = left; | |

top[1].v.right = right; | |

top[1].v.out = out; | |

/* Push the mid entry onto the stack. */ | |

top[0].v.left = s_left; | |

top[0].v.right = s_right; | |

top[0].v.out = s_right; | |

/* Leave the high entry in variables. */ | |

left += size; | |

right += size; | |

out += size * 2; | |

} while (size > 4); | |

/* When we get here, the stack top is a group of 3 | |

* convolves, with size = 4, followed by some combines. */ | |

convolve_4 (out, left, right); | |

convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right); | |

convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right); | |

top += 2; | |

/* Now process combines. */ | |

do { | |

/* b.main is the output buffer, mid is the middle | |

* part which needs to be adjusted in place, and | |

* then folded back into the output. We do this in | |

* a slightly strange way, so as to avoid having | |

* two loops. */ | |

double *out = top->b.main; | |

double *mid = scratch + size * 4; | |

unsigned int i; | |

top++; | |

out[size * 2 - 1] = 0; | |

for (i = 0; i < size - 1; i++) { | |

double lo; | |

double hi; | |

lo = mid[0] - (out[0] + out[2 * size]) + out[size]; | |

hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size]; | |

out[size] = lo; | |

out[2 * size] = hi; | |

out++; | |

mid++; | |

} | |

size <<= 1; | |

} while (top->b.null == NULL); | |

} while (top->b.main != NULL); | |

} | |

/* | |

* convolve_match: | |

* @lastchoice: an array of size SMALL. | |

* @input: an array of size BIG (2*SMALL) | |

* @state: a (non-NULL) pointer returned by convolve_init. | |

* | |

* We find the contiguous SMALL-size sub-array of input that best matches | |

* lastchoice. A measure of how good a sub-array is compared with the lastchoice | |

* is given by the sum of the products of each pair of entries. We maximise | |

* that, by taking an appropriate convolution, and then finding the maximum | |

* entry in the convolutions. | |

* | |

* Return: the position of the best match | |

*/ | |

int | |

convolve_match (const int *lastchoice, const short *input, | |

convolve_state * state) | |

{ | |

double avg = 0; | |

double best; | |

int p = 0; | |

int i; | |

double *left = state->left; | |

double *right = state->right; | |

double *scratch = state->scratch; | |

stack_entry *top = state->stack + (state->stack_size - 1); | |

for (i = 0; i < state->big; i++) | |

left[i] = input[i]; | |

for (i = 0; i < state->small; i++) { | |

double a = lastchoice[(state->small - 1) - i]; | |

right[i] = a; | |

avg += a; | |

} | |

/* We adjust the smaller of the two input arrays to have average | |

* value 0. This makes the eventual result insensitive to both | |

* constant offsets and positive multipliers of the inputs. */ | |

avg /= state->small; | |

for (i = 0; i < state->small; i++) | |

right[i] -= avg; | |

/* End-of-stack marker. */ | |

top[1].b.null = scratch; | |

top[1].b.main = NULL; | |

/* The low (small x small) part, of which we want the high outputs. */ | |

top->v.left = left; | |

top->v.right = right; | |

top->v.out = right + state->small; | |

convolve_run (top, state->small, scratch); | |

/* The high (small x small) part, of which we want the low outputs. */ | |

top->v.left = left + state->small; | |

top->v.right = right; | |

top->v.out = right; | |

convolve_run (top, state->small, scratch); | |

/* Now find the best position amoungs this. Apart from the first | |

* and last, the required convolution outputs are formed by adding | |

* outputs from the two convolutions above. */ | |

best = right[state->big - 1]; | |

right[state->big + state->small - 1] = 0; | |

p = -1; | |

for (i = 0; i < state->small; i++) { | |

double a = right[i] + right[i + state->big]; | |

if (a > best) { | |

best = a; | |

p = i; | |

} | |

} | |

p++; | |

#if 0 | |

{ | |

/* This is some debugging code... */ | |

best = 0; | |

for (i = 0; i < state->small; i++) | |

best += ((double) input[i + p]) * ((double) lastchoice[i] - avg); | |

for (i = 0; i <= state->small; i++) { | |

double tot = 0; | |

unsigned int j; | |

for (j = 0; j < state->small; j++) | |

tot += ((double) input[i + j]) * ((double) lastchoice[j] - avg); | |

if (tot > best) | |

printf ("(%i)", i); | |

if (tot != left[i + (state->small - 1)]) | |

printf ("!"); | |

} | |

printf ("%i\n", p); | |

} | |

#endif | |

return p; | |

} |