/*
 * Copyright (c) 2011-2017, Peter Abeles. All Rights Reserved.
 *
 * This file is part of BoofCV (http://boofcv.org).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package boofcv.alg.filter.convolve.down;

import boofcv.struct.convolve.Kernel1D_S32;
import boofcv.struct.convolve.Kernel2D_S32;
import boofcv.struct.image.GrayI16;
import boofcv.struct.image.GrayS16;

/**
 * <p>
 * Down-sampling convolution from a {@link GrayS16} image into a {@link GrayI16} image in which the
 * multiply-accumulate for each kernel element is written out explicitly. The kernel coefficients are
 * copied into local variables once instead of being read from the kernel array for every pixel, which
 * improves runtime performance by reducing array accesses. Routines exist for kernel widths
 * 3, 5, 7, 9 and 11 only.
 * </p>
 *
 * <p>
 * DO NOT MODIFY: This class was automatically generated by {@link GenerateConvolveDownNoBorderUnrolled}.
 * </p>
 *
 * @author Peter Abeles
 */
public class ConvolveDownNoBorderUnrolled_S16_I16 {

	/**
	 * Runs the unrolled horizontal routine that matches the kernel's width.
	 *
	 * @param kernel 1D kernel. Only widths 3, 5, 7, 9 and 11 are handled.
	 * @param image Input image.
	 * @param dest Image the result is written into.
	 * @param skip Step, in input pixels, between the locations at which the kernel is applied.
	 * @return true if an unrolled routine was run, false if the width is not handled and nothing was done.
	 */
	public static boolean horizontal( Kernel1D_S32 kernel, GrayS16 image, GrayI16 dest, int skip ) {
		switch( kernel.width ) {
			case 3:
				horizontal3(kernel, image, dest, skip);
				return true;
			case 5:
				horizontal5(kernel, image, dest, skip);
				return true;
			case 7:
				horizontal7(kernel, image, dest, skip);
				return true;
			case 9:
				horizontal9(kernel, image, dest, skip);
				return true;
			case 11:
				horizontal11(kernel, image, dest, skip);
				return true;
			default:
				return false;
		}
	}

	/**
	 * Runs the unrolled vertical routine that matches the kernel's width.
	 *
	 * @param kernel 1D kernel. Only widths 3, 5, 7, 9 and 11 are handled.
	 * @param image Input image.
	 * @param dest Image the result is written into.
	 * @param skip Step, in input pixels, between the locations at which the kernel is applied.
	 * @return true if an unrolled routine was run, false if the width is not handled and nothing was done.
	 */
	public static boolean vertical( Kernel1D_S32 kernel, GrayS16 image, GrayI16 dest, int skip ) {
		switch( kernel.width ) {
			case 3:
				vertical3(kernel, image, dest, skip);
				return true;
			case 5:
				vertical5(kernel, image, dest, skip);
				return true;
			case 7:
				vertical7(kernel, image, dest, skip);
				return true;
			case 9:
				vertical9(kernel, image, dest, skip);
				return true;
			case 11:
				vertical11(kernel, image, dest, skip);
				return true;
			default:
				return false;
		}
	}

	/**
	 * Runs the unrolled 2D routine that matches the kernel's width.
	 *
	 * @param kernel 2D kernel. Only widths 3, 5, 7, 9 and 11 are handled.
	 * @param image Input image.
	 * @param dest Image the result is written into.
	 * @param skip Step, in input pixels, between the locations at which the kernel is applied.
	 * @return true if an unrolled routine was run, false if the width is not handled and nothing was done.
	 */
	public static boolean convolve( Kernel2D_S32 kernel, GrayS16 image, GrayI16 dest, int skip ) {
		switch( kernel.width ) {
			case 3:
				convolve3(kernel, image, dest, skip);
				return true;
			case 5:
				convolve5(kernel, image, dest, skip);
				return true;
			case 7:
				convolve7(kernel, image, dest, skip);
				return true;
			case 9:
				convolve9(kernel, image, dest, skip);
				return true;
			case 11:
				convolve11(kernel, image, dest, skip);
				return true;
			default:
				return false;
		}
	}

	/**
	 * Horizontal convolution with a 3 element kernel, evaluated at every {@code skip}-th column.
	 *
	 * <p>
	 * Every input row is processed. The first and last sampled columns are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. Results are
	 * written to consecutive output pixels of the same row and are narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 3 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Column step between kernel applications.
	 */
	public static void horizontal3( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];

		final int radius = kernel.getRadius();

		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int numRows = input.getHeight();

		final int firstX = UtilDownConvolve.computeOffset(skip, radius);

		for( int row = 0; row < numRows; row++ ) {
			int dstIdx = output.startIndex + row*output.stride + firstX/skip;
			// array index of the left most kernel element when the kernel is centered on column 0
			final int rowBase = input.startIndex + row*input.stride - radius;
			final int stop = rowBase + lastX;

			for( int p = rowBase + firstX; p <= stop; p += skip ) {
				final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * Horizontal convolution with a 5 element kernel, evaluated at every {@code skip}-th column.
	 *
	 * <p>
	 * Every input row is processed. The first and last sampled columns are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. Results are
	 * written to consecutive output pixels of the same row and are narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 5 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Column step between kernel applications.
	 */
	public static void horizontal5( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];
		final int w3 = kernel.data[3];
		final int w4 = kernel.data[4];

		final int radius = kernel.getRadius();

		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int numRows = input.getHeight();

		final int firstX = UtilDownConvolve.computeOffset(skip, radius);

		for( int row = 0; row < numRows; row++ ) {
			int dstIdx = output.startIndex + row*output.stride + firstX/skip;
			// array index of the left most kernel element when the kernel is centered on column 0
			final int rowBase = input.startIndex + row*input.stride - radius;
			final int stop = rowBase + lastX;

			for( int p = rowBase + firstX; p <= stop; p += skip ) {
				final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
						+ src[p + 4]*w4;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * Horizontal convolution with a 7 element kernel, evaluated at every {@code skip}-th column.
	 *
	 * <p>
	 * Every input row is processed. The first and last sampled columns are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. Results are
	 * written to consecutive output pixels of the same row and are narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 7 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Column step between kernel applications.
	 */
	public static void horizontal7( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];
		final int w3 = kernel.data[3];
		final int w4 = kernel.data[4];
		final int w5 = kernel.data[5];
		final int w6 = kernel.data[6];

		final int radius = kernel.getRadius();

		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int numRows = input.getHeight();

		final int firstX = UtilDownConvolve.computeOffset(skip, radius);

		for( int row = 0; row < numRows; row++ ) {
			int dstIdx = output.startIndex + row*output.stride + firstX/skip;
			// array index of the left most kernel element when the kernel is centered on column 0
			final int rowBase = input.startIndex + row*input.stride - radius;
			final int stop = rowBase + lastX;

			for( int p = rowBase + firstX; p <= stop; p += skip ) {
				final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
						+ src[p + 4]*w4 + src[p + 5]*w5 + src[p + 6]*w6;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * Horizontal convolution with a 9 element kernel, evaluated at every {@code skip}-th column.
	 *
	 * <p>
	 * Every input row is processed. The first and last sampled columns are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. Results are
	 * written to consecutive output pixels of the same row and are narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 9 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Column step between kernel applications.
	 */
	public static void horizontal9( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];
		final int w3 = kernel.data[3];
		final int w4 = kernel.data[4];
		final int w5 = kernel.data[5];
		final int w6 = kernel.data[6];
		final int w7 = kernel.data[7];
		final int w8 = kernel.data[8];

		final int radius = kernel.getRadius();

		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int numRows = input.getHeight();

		final int firstX = UtilDownConvolve.computeOffset(skip, radius);

		for( int row = 0; row < numRows; row++ ) {
			int dstIdx = output.startIndex + row*output.stride + firstX/skip;
			// array index of the left most kernel element when the kernel is centered on column 0
			final int rowBase = input.startIndex + row*input.stride - radius;
			final int stop = rowBase + lastX;

			for( int p = rowBase + firstX; p <= stop; p += skip ) {
				final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
						+ src[p + 4]*w4 + src[p + 5]*w5 + src[p + 6]*w6 + src[p + 7]*w7
						+ src[p + 8]*w8;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * Horizontal convolution with an 11 element kernel, evaluated at every {@code skip}-th column.
	 *
	 * <p>
	 * Every input row is processed. The first and last sampled columns are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. Results are
	 * written to consecutive output pixels of the same row and are narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 11 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Column step between kernel applications.
	 */
	public static void horizontal11( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];
		final int w3 = kernel.data[3];
		final int w4 = kernel.data[4];
		final int w5 = kernel.data[5];
		final int w6 = kernel.data[6];
		final int w7 = kernel.data[7];
		final int w8 = kernel.data[8];
		final int w9 = kernel.data[9];
		final int w10 = kernel.data[10];

		final int radius = kernel.getRadius();

		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int numRows = input.getHeight();

		final int firstX = UtilDownConvolve.computeOffset(skip, radius);

		for( int row = 0; row < numRows; row++ ) {
			int dstIdx = output.startIndex + row*output.stride + firstX/skip;
			// array index of the left most kernel element when the kernel is centered on column 0
			final int rowBase = input.startIndex + row*input.stride - radius;
			final int stop = rowBase + lastX;

			for( int p = rowBase + firstX; p <= stop; p += skip ) {
				final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
						+ src[p + 4]*w4 + src[p + 5]*w5 + src[p + 6]*w6 + src[p + 7]*w7
						+ src[p + 8]*w8 + src[p + 9]*w9 + src[p + 10]*w10;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * Vertical convolution with a 3 element kernel, evaluated at every {@code skip}-th row.
	 *
	 * <p>
	 * Every input column is processed. The first and last sampled rows are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. The result for
	 * input row y is written to output row {@code y/skip} and is narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 3 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row step between kernel applications.
	 */
	public static void vertical3( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;
		final int stride = input.stride;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];

		final int radius = kernel.getRadius();

		final int numCols = input.width;
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int firstY = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = firstY; y <= lastY; y += skip ) {
			int dstIdx = output.startIndex + (y/skip)*output.stride;
			// start of the top most input row covered by the kernel
			final int rowBegin = input.startIndex + (y - radius)*input.stride;
			final int rowEnd = rowBegin + numCols;

			for( int p = rowBegin; p < rowEnd; p++ ) {
				final int sum = src[p]*w0 + src[p + stride]*w1 + src[p + 2*stride]*w2;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * Vertical convolution with a 5 element kernel, evaluated at every {@code skip}-th row.
	 *
	 * <p>
	 * Every input column is processed. The first and last sampled rows are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. The result for
	 * input row y is written to output row {@code y/skip} and is narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 5 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row step between kernel applications.
	 */
	public static void vertical5( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;
		final int stride = input.stride;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];
		final int w3 = kernel.data[3];
		final int w4 = kernel.data[4];

		final int radius = kernel.getRadius();

		final int numCols = input.width;
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int firstY = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = firstY; y <= lastY; y += skip ) {
			int dstIdx = output.startIndex + (y/skip)*output.stride;
			// start of the top most input row covered by the kernel
			final int rowBegin = input.startIndex + (y - radius)*input.stride;
			final int rowEnd = rowBegin + numCols;

			for( int p = rowBegin; p < rowEnd; p++ ) {
				final int sum = src[p]*w0 + src[p + stride]*w1 + src[p + 2*stride]*w2
						+ src[p + 3*stride]*w3 + src[p + 4*stride]*w4;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * Vertical convolution with a 7 element kernel, evaluated at every {@code skip}-th row.
	 *
	 * <p>
	 * Every input column is processed. The first and last sampled rows are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. The result for
	 * input row y is written to output row {@code y/skip} and is narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 7 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row step between kernel applications.
	 */
	public static void vertical7( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;
		final int stride = input.stride;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];
		final int w3 = kernel.data[3];
		final int w4 = kernel.data[4];
		final int w5 = kernel.data[5];
		final int w6 = kernel.data[6];

		final int radius = kernel.getRadius();

		final int numCols = input.width;
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int firstY = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = firstY; y <= lastY; y += skip ) {
			int dstIdx = output.startIndex + (y/skip)*output.stride;
			// start of the top most input row covered by the kernel
			final int rowBegin = input.startIndex + (y - radius)*input.stride;
			final int rowEnd = rowBegin + numCols;

			for( int p = rowBegin; p < rowEnd; p++ ) {
				final int sum = src[p]*w0 + src[p + stride]*w1 + src[p + 2*stride]*w2
						+ src[p + 3*stride]*w3 + src[p + 4*stride]*w4 + src[p + 5*stride]*w5
						+ src[p + 6*stride]*w6;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * Vertical convolution with a 9 element kernel, evaluated at every {@code skip}-th row.
	 *
	 * <p>
	 * Every input column is processed. The first and last sampled rows are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. The result for
	 * input row y is written to output row {@code y/skip} and is narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 9 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row step between kernel applications.
	 */
	public static void vertical9( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;
		final int stride = input.stride;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];
		final int w3 = kernel.data[3];
		final int w4 = kernel.data[4];
		final int w5 = kernel.data[5];
		final int w6 = kernel.data[6];
		final int w7 = kernel.data[7];
		final int w8 = kernel.data[8];

		final int radius = kernel.getRadius();

		final int numCols = input.width;
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int firstY = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = firstY; y <= lastY; y += skip ) {
			int dstIdx = output.startIndex + (y/skip)*output.stride;
			// start of the top most input row covered by the kernel
			final int rowBegin = input.startIndex + (y - radius)*input.stride;
			final int rowEnd = rowBegin + numCols;

			for( int p = rowBegin; p < rowEnd; p++ ) {
				final int sum = src[p]*w0 + src[p + stride]*w1 + src[p + 2*stride]*w2
						+ src[p + 3*stride]*w3 + src[p + 4*stride]*w4 + src[p + 5*stride]*w5
						+ src[p + 6*stride]*w6 + src[p + 7*stride]*w7 + src[p + 8*stride]*w8;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * Vertical convolution with an 11 element kernel, evaluated at every {@code skip}-th row.
	 *
	 * <p>
	 * Every input column is processed. The first and last sampled rows are taken from
	 * {@link UtilDownConvolve#computeOffset} and {@link UtilDownConvolve#computeMaxSide}. The result for
	 * input row y is written to output row {@code y/skip} and is narrowed with a plain cast to short.
	 * </p>
	 *
	 * @param kernel Kernel whose first 11 coefficients are used.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row step between kernel applications.
	 */
	public static void vertical11( Kernel1D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;
		final int stride = input.stride;

		final int w0 = kernel.data[0];
		final int w1 = kernel.data[1];
		final int w2 = kernel.data[2];
		final int w3 = kernel.data[3];
		final int w4 = kernel.data[4];
		final int w5 = kernel.data[5];
		final int w6 = kernel.data[6];
		final int w7 = kernel.data[7];
		final int w8 = kernel.data[8];
		final int w9 = kernel.data[9];
		final int w10 = kernel.data[10];

		final int radius = kernel.getRadius();

		final int numCols = input.width;
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int firstY = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = firstY; y <= lastY; y += skip ) {
			int dstIdx = output.startIndex + (y/skip)*output.stride;
			// start of the top most input row covered by the kernel
			final int rowBegin = input.startIndex + (y - radius)*input.stride;
			final int rowEnd = rowBegin + numCols;

			for( int p = rowBegin; p < rowEnd; p++ ) {
				final int sum = src[p]*w0 + src[p + stride]*w1 + src[p + 2*stride]*w2
						+ src[p + 3*stride]*w3 + src[p + 4*stride]*w4 + src[p + 5*stride]*w5
						+ src[p + 6*stride]*w6 + src[p + 7*stride]*w7 + src[p + 8*stride]*w8
						+ src[p + 9*stride]*w9 + src[p + 10*stride]*w10;

				dst[dstIdx++] = (short) sum;
			}
		}
	}

	/**
	 * 2D convolution with a 3x3 kernel, evaluated at every {@code skip}-th row and column.
	 *
	 * <p>
	 * For each sampled row the kernel is applied one kernel row at a time. The pass for the first kernel
	 * row overwrites the output pixels and each later pass adds to them. Every partial row sum is
	 * narrowed with a plain cast to short before it is stored or added.
	 * </p>
	 *
	 * @param kernel Kernel whose coefficients are read in row-major order, 3 per row.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row and column step between kernel applications.
	 */
	public static void convolve3( Kernel2D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int radius = kernel.getRadius();
		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int first = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = first; y <= lastY; y += skip ) {
			for( int ky = 0; ky < 3; ky++ ) {
				final int w0 = kernel.data[ky*3];
				final int w1 = kernel.data[ky*3 + 1];
				final int w2 = kernel.data[ky*3 + 2];

				// every pass starts again at the first output pixel of this output row
				int dstIdx = output.startIndex + (y/skip)*output.stride + first/skip;
				final int srcRow = input.startIndex + (y + ky - radius)*input.stride - radius;

				if( ky == 0 ) {
					// first kernel row: the output value is set
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2;

						dst[dstIdx++] = (short) sum;
					}
				} else {
					// remaining kernel rows: added to what is already there
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2;

						dst[dstIdx++] += (short) sum;
					}
				}
			}
		}
	}

	/**
	 * 2D convolution with a 5x5 kernel, evaluated at every {@code skip}-th row and column.
	 *
	 * <p>
	 * For each sampled row the kernel is applied one kernel row at a time. The pass for the first kernel
	 * row overwrites the output pixels and each later pass adds to them. Every partial row sum is
	 * narrowed with a plain cast to short before it is stored or added.
	 * </p>
	 *
	 * @param kernel Kernel whose coefficients are read in row-major order, 5 per row.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row and column step between kernel applications.
	 */
	public static void convolve5( Kernel2D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int radius = kernel.getRadius();
		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int first = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = first; y <= lastY; y += skip ) {
			for( int ky = 0; ky < 5; ky++ ) {
				final int w0 = kernel.data[ky*5];
				final int w1 = kernel.data[ky*5 + 1];
				final int w2 = kernel.data[ky*5 + 2];
				final int w3 = kernel.data[ky*5 + 3];
				final int w4 = kernel.data[ky*5 + 4];

				// every pass starts again at the first output pixel of this output row
				int dstIdx = output.startIndex + (y/skip)*output.stride + first/skip;
				final int srcRow = input.startIndex + (y + ky - radius)*input.stride - radius;

				if( ky == 0 ) {
					// first kernel row: the output value is set
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
								+ src[p + 4]*w4;

						dst[dstIdx++] = (short) sum;
					}
				} else {
					// remaining kernel rows: added to what is already there
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
								+ src[p + 4]*w4;

						dst[dstIdx++] += (short) sum;
					}
				}
			}
		}
	}

	/**
	 * 2D convolution with a 7x7 kernel, evaluated at every {@code skip}-th row and column.
	 *
	 * <p>
	 * For each sampled row the kernel is applied one kernel row at a time. The pass for the first kernel
	 * row overwrites the output pixels and each later pass adds to them. Every partial row sum is
	 * narrowed with a plain cast to short before it is stored or added.
	 * </p>
	 *
	 * @param kernel Kernel whose coefficients are read in row-major order, 7 per row.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row and column step between kernel applications.
	 */
	public static void convolve7( Kernel2D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int radius = kernel.getRadius();
		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int first = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = first; y <= lastY; y += skip ) {
			for( int ky = 0; ky < 7; ky++ ) {
				final int w0 = kernel.data[ky*7];
				final int w1 = kernel.data[ky*7 + 1];
				final int w2 = kernel.data[ky*7 + 2];
				final int w3 = kernel.data[ky*7 + 3];
				final int w4 = kernel.data[ky*7 + 4];
				final int w5 = kernel.data[ky*7 + 5];
				final int w6 = kernel.data[ky*7 + 6];

				// every pass starts again at the first output pixel of this output row
				int dstIdx = output.startIndex + (y/skip)*output.stride + first/skip;
				final int srcRow = input.startIndex + (y + ky - radius)*input.stride - radius;

				if( ky == 0 ) {
					// first kernel row: the output value is set
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
								+ src[p + 4]*w4 + src[p + 5]*w5 + src[p + 6]*w6;

						dst[dstIdx++] = (short) sum;
					}
				} else {
					// remaining kernel rows: added to what is already there
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
								+ src[p + 4]*w4 + src[p + 5]*w5 + src[p + 6]*w6;

						dst[dstIdx++] += (short) sum;
					}
				}
			}
		}
	}

	/**
	 * 2D convolution with a 9x9 kernel, evaluated at every {@code skip}-th row and column.
	 *
	 * <p>
	 * For each sampled row the kernel is applied one kernel row at a time. The pass for the first kernel
	 * row overwrites the output pixels and each later pass adds to them. Every partial row sum is
	 * narrowed with a plain cast to short before it is stored or added.
	 * </p>
	 *
	 * @param kernel Kernel whose coefficients are read in row-major order, 9 per row.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row and column step between kernel applications.
	 */
	public static void convolve9( Kernel2D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int radius = kernel.getRadius();
		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int first = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = first; y <= lastY; y += skip ) {
			for( int ky = 0; ky < 9; ky++ ) {
				final int w0 = kernel.data[ky*9];
				final int w1 = kernel.data[ky*9 + 1];
				final int w2 = kernel.data[ky*9 + 2];
				final int w3 = kernel.data[ky*9 + 3];
				final int w4 = kernel.data[ky*9 + 4];
				final int w5 = kernel.data[ky*9 + 5];
				final int w6 = kernel.data[ky*9 + 6];
				final int w7 = kernel.data[ky*9 + 7];
				final int w8 = kernel.data[ky*9 + 8];

				// every pass starts again at the first output pixel of this output row
				int dstIdx = output.startIndex + (y/skip)*output.stride + first/skip;
				final int srcRow = input.startIndex + (y + ky - radius)*input.stride - radius;

				if( ky == 0 ) {
					// first kernel row: the output value is set
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
								+ src[p + 4]*w4 + src[p + 5]*w5 + src[p + 6]*w6 + src[p + 7]*w7
								+ src[p + 8]*w8;

						dst[dstIdx++] = (short) sum;
					}
				} else {
					// remaining kernel rows: added to what is already there
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
								+ src[p + 4]*w4 + src[p + 5]*w5 + src[p + 6]*w6 + src[p + 7]*w7
								+ src[p + 8]*w8;

						dst[dstIdx++] += (short) sum;
					}
				}
			}
		}
	}

	/**
	 * 2D convolution with an 11x11 kernel, evaluated at every {@code skip}-th row and column.
	 *
	 * <p>
	 * For each sampled row the kernel is applied one kernel row at a time. The pass for the first kernel
	 * row overwrites the output pixels and each later pass adds to them. Every partial row sum is
	 * narrowed with a plain cast to short before it is stored or added.
	 * </p>
	 *
	 * @param kernel Kernel whose coefficients are read in row-major order, 11 per row.
	 * @param input Source image.
	 * @param output Destination image.
	 * @param skip Row and column step between kernel applications.
	 */
	public static void convolve11( Kernel2D_S32 kernel, GrayS16 input, GrayI16 output, int skip ) {
		final short[] src = input.data;
		final short[] dst = output.data;

		final int radius = kernel.getRadius();
		final int lastX = UtilDownConvolve.computeMaxSide(input.width, skip, radius);
		final int lastY = UtilDownConvolve.computeMaxSide(input.height, skip, radius);

		final int first = UtilDownConvolve.computeOffset(skip, radius);

		for( int y = first; y <= lastY; y += skip ) {
			for( int ky = 0; ky < 11; ky++ ) {
				final int w0 = kernel.data[ky*11];
				final int w1 = kernel.data[ky*11 + 1];
				final int w2 = kernel.data[ky*11 + 2];
				final int w3 = kernel.data[ky*11 + 3];
				final int w4 = kernel.data[ky*11 + 4];
				final int w5 = kernel.data[ky*11 + 5];
				final int w6 = kernel.data[ky*11 + 6];
				final int w7 = kernel.data[ky*11 + 7];
				final int w8 = kernel.data[ky*11 + 8];
				final int w9 = kernel.data[ky*11 + 9];
				final int w10 = kernel.data[ky*11 + 10];

				// every pass starts again at the first output pixel of this output row
				int dstIdx = output.startIndex + (y/skip)*output.stride + first/skip;
				final int srcRow = input.startIndex + (y + ky - radius)*input.stride - radius;

				if( ky == 0 ) {
					// first kernel row: the output value is set
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
								+ src[p + 4]*w4 + src[p + 5]*w5 + src[p + 6]*w6 + src[p + 7]*w7
								+ src[p + 8]*w8 + src[p + 9]*w9 + src[p + 10]*w10;

						dst[dstIdx++] = (short) sum;
					}
				} else {
					// remaining kernel rows: added to what is already there
					for( int x = first; x <= lastX; x += skip ) {
						final int p = srcRow + x;
						final int sum = src[p]*w0 + src[p + 1]*w1 + src[p + 2]*w2 + src[p + 3]*w3
								+ src[p + 4]*w4 + src[p + 5]*w5 + src[p + 6]*w6 + src[p + 7]*w7
								+ src[p + 8]*w8 + src[p + 9]*w9 + src[p + 10]*w10;

						dst[dstIdx++] += (short) sum;
					}
				}
			}
		}
	}

}
