/*
 * Copyright (c) 2011-2019, Peter Abeles. All Rights Reserved.
 *
 * This file is part of BoofCV (http://boofcv.org).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package boofcv.alg.filter.convolve.noborder;

import boofcv.concurrency.BoofConcurrency;
import boofcv.concurrency.DWorkArrays;
import boofcv.concurrency.FWorkArrays;
import boofcv.concurrency.IWorkArrays;
import boofcv.struct.image.*;

import javax.annotation.Generated;

/**
 * <p>
 * Convolves a 1D mean filter across the image. Each output pixel is the mean of the input pixels covered
 * by a kernel of width {@code 2*radius + 1}, applied either along rows ({@code horizontal}) or along
 * columns ({@code vertical}). Pixels closer than {@code radius} to the image border along the filtered
 * axis are not written by this class. The work for each image is handed to {@link BoofConcurrency}.
 * </p>
 * <p>
 * Do not modify.  Auto generated by GenerateImplConvolveMean
 * </p>
 *
 * @author Peter Abeles
 */
@Generated({"boofcv.alg.filter.convolve.noborder.GenerateImplConvolveMean"})
@SuppressWarnings({"ForLoopReplaceableByForEach","Duplicates"})
public class ImplConvolveMean_MT {

	/**
	 * Horizontal mean filter for unsigned 8-bit images. A running sum is kept for each row; the value
	 * written is {@code (sum + divisor/2)/divisor} using integer division, where divisor is the kernel width.
	 *
	 * @param input Image being filtered. Pixels are read as unsigned values ({@code & 0xFF}).
	 * @param output Storage for the result. In every row only columns {@code radius} to
	 * {@code input.width - radius - 1} are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 */
	public static void horizontal( GrayU8 input , GrayI8 output , int radius ) {
		final int kernelWidth = radius*2 + 1;
		final int divisor = kernelWidth;
		final int halfDivisor = divisor/2;

		BoofConcurrency.loopFor(0, input.height, y -> {
			int src = input.startIndex + input.stride*y;
			int dst = output.startIndex + output.stride*y + radius;

			// seed the running sum with the first complete window in this row
			int sum = 0;
			final int seedEnd = src + kernelWidth;
			while( src < seedEnd ) {
				sum += input.data[src++] & 0xFF;
			}
			output.data[dst++] = (byte)((sum + halfDivisor)/divisor);

			// slide the window one pixel at a time: drop the pixel which left, add the one which entered
			final int slideEnd = src + input.width - kernelWidth;
			while( src < slideEnd ) {
				final int leaving = input.data[src - kernelWidth] & 0xFF;
				final int entering = input.data[src] & 0xFF;
				sum += entering - leaving;
				src++;

				output.data[dst++] = (byte)((sum + halfDivisor)/divisor);
			}
		});
	}

	/**
	 * Vertical mean filter for unsigned 8-bit images. Within each block of rows provided by
	 * {@link BoofConcurrency#loopBlocks} the column sums are computed from scratch for the block's first
	 * row and then updated incrementally for the following rows. The value written is
	 * {@code (sum + divisor/2)/divisor} using integer division.
	 *
	 * @param input Image being filtered. Pixels are read as unsigned values ({@code & 0xFF}).
	 * @param output Storage for the result. The row range passed to loopBlocks is {@code radius} (inclusive)
	 * to {@code output.height - radius} (exclusive); all {@code input.width} columns of a processed row are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 * @param work Source of the arrays which store the per-column sums. If null a new instance is created
	 * using {@code input.width}, otherwise {@code reset(input.width)} is called on it.
	 */
	public static void vertical(GrayU8 input , GrayI8 output , int radius, IWorkArrays work ) {
		final IWorkArrays workspace;
		if( work != null ) {
			work.reset(input.width);
			workspace = work;
		} else {
			workspace = new IWorkArrays(input.width);
		}

		final int kernelWidth = radius*2 + 1;
		// offset from the row entering the window back to the row which just left it
		final int backStep = kernelWidth*input.stride;
		final int divisor = kernelWidth;
		final int halfDivisor = divisor/2;

		// Rows are traversed in the inner most loops, instead of walking down each column, to reduce
		// cache misses. The price is a book keeping array of column sums, one popped per block.
		BoofConcurrency.loopBlocks(radius, output.height-radius, kernelWidth, (y0,y1) -> {
			final int[] columnSums = workspace.pop();

			// first row in the block: sum up every column's window from scratch
			final int firstIn = input.startIndex + (y0-radius)*input.stride;
			final int firstOut = output.startIndex + output.stride*y0;
			for( int x = 0; x < input.width; x++ ) {
				int src = firstIn + x;
				final int srcEnd = src + input.stride*kernelWidth;

				int sum = 0;
				while( src < srcEnd ) {
					sum += input.data[src] & 0xFF;
					src += input.stride;
				}
				columnSums[x] = sum;
				output.data[firstOut + x] = (byte)((sum + halfDivisor)/divisor);
			}

			// remaining rows: update each column sum by removing the old top row and adding the new bottom row
			for( int y = y0+1; y < y1; y++ ) {
				final int rowIn = input.startIndex + (y+radius)*input.stride;
				final int rowOut = output.startIndex + y*output.stride;

				for( int x = 0; x < input.width; x++ ) {
					final int idx = rowIn + x;
					int sum = columnSums[x] - (input.data[idx - backStep] & 0xFF);
					sum += input.data[idx] & 0xFF;
					columnSums[x] = sum;

					output.data[rowOut + x] = (byte)((sum + halfDivisor)/divisor);
				}
			}
			workspace.recycle(columnSums);
		});
	}

	/**
	 * Horizontal mean filter for signed 16-bit images. A running sum is kept for each row; the value
	 * written is {@code (sum + divisor/2)/divisor} using integer division, where divisor is the kernel width.
	 *
	 * @param input Image being filtered. Pixels are read as signed values.
	 * @param output Storage for the result. In every row only columns {@code radius} to
	 * {@code input.width - radius - 1} are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 */
	public static void horizontal( GrayS16 input , GrayI16 output , int radius ) {
		final int kernelWidth = radius*2 + 1;
		final int divisor = kernelWidth;
		final int halfDivisor = divisor/2;

		BoofConcurrency.loopFor(0, input.height, y -> {
			int src = input.startIndex + input.stride*y;
			int dst = output.startIndex + output.stride*y + radius;

			// seed the running sum with the first complete window in this row
			int sum = 0;
			final int seedEnd = src + kernelWidth;
			while( src < seedEnd ) {
				sum += input.data[src++];
			}
			output.data[dst++] = (short)((sum + halfDivisor)/divisor);

			// slide the window one pixel at a time: drop the pixel which left, add the one which entered
			final int slideEnd = src + input.width - kernelWidth;
			while( src < slideEnd ) {
				final int leaving = input.data[src - kernelWidth];
				final int entering = input.data[src];
				sum += entering - leaving;
				src++;

				output.data[dst++] = (short)((sum + halfDivisor)/divisor);
			}
		});
	}

	/**
	 * Vertical mean filter for signed 16-bit images. Within each block of rows provided by
	 * {@link BoofConcurrency#loopBlocks} the column sums are computed from scratch for the block's first
	 * row and then updated incrementally for the following rows. The value written is
	 * {@code (sum + divisor/2)/divisor} using integer division.
	 *
	 * @param input Image being filtered. Pixels are read as signed values.
	 * @param output Storage for the result. The row range passed to loopBlocks is {@code radius} (inclusive)
	 * to {@code output.height - radius} (exclusive); all {@code input.width} columns of a processed row are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 * @param work Source of the arrays which store the per-column sums. If null a new instance is created
	 * using {@code input.width}, otherwise {@code reset(input.width)} is called on it.
	 */
	public static void vertical(GrayS16 input , GrayI16 output , int radius, IWorkArrays work ) {
		final IWorkArrays workspace;
		if( work != null ) {
			work.reset(input.width);
			workspace = work;
		} else {
			workspace = new IWorkArrays(input.width);
		}

		final int kernelWidth = radius*2 + 1;
		// offset from the row entering the window back to the row which just left it
		final int backStep = kernelWidth*input.stride;
		final int divisor = kernelWidth;
		final int halfDivisor = divisor/2;

		// Rows are traversed in the inner most loops, instead of walking down each column, to reduce
		// cache misses. The price is a book keeping array of column sums, one popped per block.
		BoofConcurrency.loopBlocks(radius, output.height-radius, kernelWidth, (y0,y1) -> {
			final int[] columnSums = workspace.pop();

			// first row in the block: sum up every column's window from scratch
			final int firstIn = input.startIndex + (y0-radius)*input.stride;
			final int firstOut = output.startIndex + output.stride*y0;
			for( int x = 0; x < input.width; x++ ) {
				int src = firstIn + x;
				final int srcEnd = src + input.stride*kernelWidth;

				int sum = 0;
				while( src < srcEnd ) {
					sum += input.data[src];
					src += input.stride;
				}
				columnSums[x] = sum;
				output.data[firstOut + x] = (short)((sum + halfDivisor)/divisor);
			}

			// remaining rows: update each column sum by removing the old top row and adding the new bottom row
			for( int y = y0+1; y < y1; y++ ) {
				final int rowIn = input.startIndex + (y+radius)*input.stride;
				final int rowOut = output.startIndex + y*output.stride;

				for( int x = 0; x < input.width; x++ ) {
					final int idx = rowIn + x;
					int sum = columnSums[x] - (input.data[idx - backStep]);
					sum += input.data[idx];
					columnSums[x] = sum;

					output.data[rowOut + x] = (short)((sum + halfDivisor)/divisor);
				}
			}
			workspace.recycle(columnSums);
		});
	}

	/**
	 * Horizontal mean filter for unsigned 16-bit images. A running sum is kept for each row; the value
	 * written is {@code (sum + divisor/2)/divisor} using integer division, where divisor is the kernel width.
	 *
	 * @param input Image being filtered. Pixels are read as unsigned values ({@code & 0xFFFF}).
	 * @param output Storage for the result. In every row only columns {@code radius} to
	 * {@code input.width - radius - 1} are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 */
	public static void horizontal( GrayU16 input , GrayI16 output , int radius ) {
		final int kernelWidth = radius*2 + 1;
		final int divisor = kernelWidth;
		final int halfDivisor = divisor/2;

		BoofConcurrency.loopFor(0, input.height, y -> {
			int src = input.startIndex + input.stride*y;
			int dst = output.startIndex + output.stride*y + radius;

			// seed the running sum with the first complete window in this row
			int sum = 0;
			final int seedEnd = src + kernelWidth;
			while( src < seedEnd ) {
				sum += input.data[src++] & 0xFFFF;
			}
			output.data[dst++] = (short)((sum + halfDivisor)/divisor);

			// slide the window one pixel at a time: drop the pixel which left, add the one which entered
			final int slideEnd = src + input.width - kernelWidth;
			while( src < slideEnd ) {
				final int leaving = input.data[src - kernelWidth] & 0xFFFF;
				final int entering = input.data[src] & 0xFFFF;
				sum += entering - leaving;
				src++;

				output.data[dst++] = (short)((sum + halfDivisor)/divisor);
			}
		});
	}

	/**
	 * Vertical mean filter for unsigned 16-bit images. Within each block of rows provided by
	 * {@link BoofConcurrency#loopBlocks} the column sums are computed from scratch for the block's first
	 * row and then updated incrementally for the following rows. The value written is
	 * {@code (sum + divisor/2)/divisor} using integer division.
	 *
	 * @param input Image being filtered. Pixels are read as unsigned values ({@code & 0xFFFF}).
	 * @param output Storage for the result. The row range passed to loopBlocks is {@code radius} (inclusive)
	 * to {@code output.height - radius} (exclusive); all {@code input.width} columns of a processed row are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 * @param work Source of the arrays which store the per-column sums. If null a new instance is created
	 * using {@code input.width}, otherwise {@code reset(input.width)} is called on it.
	 */
	public static void vertical(GrayU16 input , GrayI16 output , int radius, IWorkArrays work ) {
		final IWorkArrays workspace;
		if( work != null ) {
			work.reset(input.width);
			workspace = work;
		} else {
			workspace = new IWorkArrays(input.width);
		}

		final int kernelWidth = radius*2 + 1;
		// offset from the row entering the window back to the row which just left it
		final int backStep = kernelWidth*input.stride;
		final int divisor = kernelWidth;
		final int halfDivisor = divisor/2;

		// Rows are traversed in the inner most loops, instead of walking down each column, to reduce
		// cache misses. The price is a book keeping array of column sums, one popped per block.
		BoofConcurrency.loopBlocks(radius, output.height-radius, kernelWidth, (y0,y1) -> {
			final int[] columnSums = workspace.pop();

			// first row in the block: sum up every column's window from scratch
			final int firstIn = input.startIndex + (y0-radius)*input.stride;
			final int firstOut = output.startIndex + output.stride*y0;
			for( int x = 0; x < input.width; x++ ) {
				int src = firstIn + x;
				final int srcEnd = src + input.stride*kernelWidth;

				int sum = 0;
				while( src < srcEnd ) {
					sum += input.data[src] & 0xFFFF;
					src += input.stride;
				}
				columnSums[x] = sum;
				output.data[firstOut + x] = (short)((sum + halfDivisor)/divisor);
			}

			// remaining rows: update each column sum by removing the old top row and adding the new bottom row
			for( int y = y0+1; y < y1; y++ ) {
				final int rowIn = input.startIndex + (y+radius)*input.stride;
				final int rowOut = output.startIndex + y*output.stride;

				for( int x = 0; x < input.width; x++ ) {
					final int idx = rowIn + x;
					int sum = columnSums[x] - (input.data[idx - backStep] & 0xFFFF);
					sum += input.data[idx] & 0xFFFF;
					columnSums[x] = sum;

					output.data[rowOut + x] = (short)((sum + halfDivisor)/divisor);
				}
			}
			workspace.recycle(columnSums);
		});
	}

	/**
	 * Horizontal mean filter for 32-bit floating point images. A running sum is kept for each row and
	 * the value written is {@code sum/divisor}, where divisor is the kernel width.
	 *
	 * @param input Image being filtered.
	 * @param output Storage for the result. In every row only columns {@code radius} to
	 * {@code input.width - radius - 1} are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 */
	public static void horizontal( GrayF32 input , GrayF32 output , int radius ) {
		final int kernelWidth = radius*2 + 1;
		final float divisor = kernelWidth;

		BoofConcurrency.loopFor(0, input.height, y -> {
			int src = input.startIndex + input.stride*y;
			int dst = output.startIndex + output.stride*y + radius;

			// seed the running sum with the first complete window in this row
			float sum = 0;
			final int seedEnd = src + kernelWidth;
			while( src < seedEnd ) {
				sum += input.data[src++];
			}
			output.data[dst++] = sum/divisor;

			// slide the window one pixel at a time. Subtract first and then add, as two separate
			// operations, so that the floating point round off does not change.
			final int slideEnd = src + input.width - kernelWidth;
			while( src < slideEnd ) {
				sum -= input.data[src - kernelWidth];
				sum += input.data[src];
				src++;

				output.data[dst++] = sum/divisor;
			}
		});
	}

	/**
	 * Vertical mean filter for 32-bit floating point images. Within each block of rows provided by
	 * {@link BoofConcurrency#loopBlocks} the column sums are computed from scratch for the block's first
	 * row and then updated incrementally for the following rows. The value written is {@code sum/divisor}.
	 *
	 * @param input Image being filtered.
	 * @param output Storage for the result. The row range passed to loopBlocks is {@code radius} (inclusive)
	 * to {@code output.height - radius} (exclusive); all {@code input.width} columns of a processed row are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 * @param work Source of the arrays which store the per-column sums. If null a new instance is created
	 * using {@code input.width}, otherwise {@code reset(input.width)} is called on it.
	 */
	public static void vertical(GrayF32 input , GrayF32 output , int radius, FWorkArrays work ) {
		final FWorkArrays workspace;
		if( work != null ) {
			work.reset(input.width);
			workspace = work;
		} else {
			workspace = new FWorkArrays(input.width);
		}

		final int kernelWidth = radius*2 + 1;
		// offset from the row entering the window back to the row which just left it
		final int backStep = kernelWidth*input.stride;
		final float divisor = kernelWidth;

		// Rows are traversed in the inner most loops, instead of walking down each column, to reduce
		// cache misses. The price is a book keeping array of column sums, one popped per block.
		BoofConcurrency.loopBlocks(radius, output.height-radius, kernelWidth, (y0,y1) -> {
			final float[] columnSums = workspace.pop();

			// first row in the block: sum up every column's window from scratch
			final int firstIn = input.startIndex + (y0-radius)*input.stride;
			final int firstOut = output.startIndex + output.stride*y0;
			for( int x = 0; x < input.width; x++ ) {
				int src = firstIn + x;
				final int srcEnd = src + input.stride*kernelWidth;

				float sum = 0;
				while( src < srcEnd ) {
					sum += input.data[src];
					src += input.stride;
				}
				columnSums[x] = sum;
				output.data[firstOut + x] = sum/divisor;
			}

			// remaining rows: update each column sum by removing the old top row and adding the new bottom row
			for( int y = y0+1; y < y1; y++ ) {
				final int rowIn = input.startIndex + (y+radius)*input.stride;
				final int rowOut = output.startIndex + y*output.stride;

				for( int x = 0; x < input.width; x++ ) {
					final int idx = rowIn + x;
					float sum = columnSums[x] - (input.data[idx - backStep]);
					sum += input.data[idx];
					columnSums[x] = sum;

					output.data[rowOut + x] = sum/divisor;
				}
			}
			workspace.recycle(columnSums);
		});
	}

	/**
	 * Horizontal mean filter for 64-bit floating point images. A running sum is kept for each row and
	 * the value written is {@code sum/divisor}, where divisor is the kernel width.
	 *
	 * @param input Image being filtered.
	 * @param output Storage for the result. In every row only columns {@code radius} to
	 * {@code input.width - radius - 1} are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 */
	public static void horizontal( GrayF64 input , GrayF64 output , int radius ) {
		final int kernelWidth = radius*2 + 1;
		final double divisor = kernelWidth;

		BoofConcurrency.loopFor(0, input.height, y -> {
			int src = input.startIndex + input.stride*y;
			int dst = output.startIndex + output.stride*y + radius;

			// seed the running sum with the first complete window in this row
			double sum = 0;
			final int seedEnd = src + kernelWidth;
			while( src < seedEnd ) {
				sum += input.data[src++];
			}
			output.data[dst++] = sum/divisor;

			// slide the window one pixel at a time. Subtract first and then add, as two separate
			// operations, so that the floating point round off does not change.
			final int slideEnd = src + input.width - kernelWidth;
			while( src < slideEnd ) {
				sum -= input.data[src - kernelWidth];
				sum += input.data[src];
				src++;

				output.data[dst++] = sum/divisor;
			}
		});
	}

	/**
	 * Vertical mean filter for 64-bit floating point images. Within each block of rows provided by
	 * {@link BoofConcurrency#loopBlocks} the column sums are computed from scratch for the block's first
	 * row and then updated incrementally for the following rows. The value written is {@code sum/divisor}.
	 *
	 * @param input Image being filtered.
	 * @param output Storage for the result. The row range passed to loopBlocks is {@code radius} (inclusive)
	 * to {@code output.height - radius} (exclusive); all {@code input.width} columns of a processed row are written.
	 * @param radius Kernel radius. The kernel's width is {@code 2*radius + 1}.
	 * @param work Source of the arrays which store the per-column sums. If null a new instance is created
	 * using {@code input.width}, otherwise {@code reset(input.width)} is called on it.
	 */
	public static void vertical(GrayF64 input , GrayF64 output , int radius, DWorkArrays work ) {
		final DWorkArrays workspace;
		if( work != null ) {
			work.reset(input.width);
			workspace = work;
		} else {
			workspace = new DWorkArrays(input.width);
		}

		final int kernelWidth = radius*2 + 1;
		// offset from the row entering the window back to the row which just left it
		final int backStep = kernelWidth*input.stride;
		final double divisor = kernelWidth;

		// Rows are traversed in the inner most loops, instead of walking down each column, to reduce
		// cache misses. The price is a book keeping array of column sums, one popped per block.
		BoofConcurrency.loopBlocks(radius, output.height-radius, kernelWidth, (y0,y1) -> {
			final double[] columnSums = workspace.pop();

			// first row in the block: sum up every column's window from scratch
			final int firstIn = input.startIndex + (y0-radius)*input.stride;
			final int firstOut = output.startIndex + output.stride*y0;
			for( int x = 0; x < input.width; x++ ) {
				int src = firstIn + x;
				final int srcEnd = src + input.stride*kernelWidth;

				double sum = 0;
				while( src < srcEnd ) {
					sum += input.data[src];
					src += input.stride;
				}
				columnSums[x] = sum;
				output.data[firstOut + x] = sum/divisor;
			}

			// remaining rows: update each column sum by removing the old top row and adding the new bottom row
			for( int y = y0+1; y < y1; y++ ) {
				final int rowIn = input.startIndex + (y+radius)*input.stride;
				final int rowOut = output.startIndex + y*output.stride;

				for( int x = 0; x < input.width; x++ ) {
					final int idx = rowIn + x;
					double sum = columnSums[x] - (input.data[idx - backStep]);
					sum += input.data[idx];
					columnSums[x] = sum;

					output.data[rowOut + x] = sum/divisor;
				}
			}
			workspace.recycle(columnSums);
		});
	}
}
