/*
 * Decompiled with CFR 0.152.
 */
package boofcv.alg.filter.convolve.noborder;

import boofcv.concurrency.BoofConcurrency;
import boofcv.struct.convolve.Kernel1D_F32;
import boofcv.struct.convolve.Kernel1D_F64;
import boofcv.struct.convolve.Kernel1D_S32;
import boofcv.struct.convolve.Kernel2D_F32;
import boofcv.struct.convolve.Kernel2D_F64;
import boofcv.struct.convolve.Kernel2D_S32;
import boofcv.struct.image.InterleavedF32;
import boofcv.struct.image.InterleavedF64;
import boofcv.struct.image.InterleavedI16;
import boofcv.struct.image.InterleavedI8;
import boofcv.struct.image.InterleavedS16;
import boofcv.struct.image.InterleavedS32;
import boofcv.struct.image.InterleavedU16;
import boofcv.struct.image.InterleavedU8;
import java.util.Arrays;

public class ConvolveImageStandard_IL_MT {
    /**
     * Applies a 1D kernel along the rows of an interleaved 32-bit float image.
     *
     * <p>Each row index in {@code [0, src.height)} is passed to {@code BoofConcurrency.loopFor}.
     * Only positions where the kernel fits completely inside the row are computed: the first
     * result of a row is stored at column {@code kernel.getOffset()} and
     * {@code src.width - (kernel.getWidth() - 1)} pixels are written. Other pixels of
     * {@code dst} are not touched.</p>
     *
     * @param kernel 1D kernel; its weights are applied to consecutive pixels of the same band
     * @param src input image (read only)
     * @param dst output image receiving the weighted sums
     */
    public static void horizontal(Kernel1D_F32 kernel, InterleavedF32 src, InterleavedF32 dst) {
        final float[] input = src.data;
        final float[] output = dst.data;
        final float[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int pixelsPerRow = src.width - (taps - 1);
        BoofConcurrency.loopFor(0, src.height, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + anchor * bands;
            int pixelIdx = src.startIndex + row * src.stride;
            for (int col = 0; col < pixelsPerRow; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    float sum = 0.0f;
                    // step by the band count so only samples of band b are visited
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += bands) {
                        sum += input[readIdx] * weights[k];
                    }
                    output[writeIdx++] = sum;
                }
            }
        });
    }

    /**
     * Applies a 1D kernel along the columns of an interleaved 32-bit float image.
     *
     * <p>Output rows {@code y} in {@code [offset, dst.getHeight() - (kernelWidth - offset - 1))}
     * are handed to {@code BoofConcurrency.loopFor}. Each output row is first cleared and then
     * accumulated one kernel tap at a time, adding a whole weighted source row per tap.</p>
     *
     * @param kernel 1D kernel applied down the columns
     * @param src input image (read only)
     * @param dst output image; only the rows described above are overwritten
     */
    public static void vertical(Kernel1D_F32 kernel, InterleavedF32 src, InterleavedF32 dst) {
        final float[] input = src.data;
        final float[] output = dst.data;
        final float[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rowLimit = dst.getHeight() - (taps - anchor - 1);
        final int rowLength = dst.getWidth() * bands;
        BoofConcurrency.loopFor(anchor, rowLimit, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            Arrays.fill(output, outBase, outBase + rowLength, 0.0f);
            // first source row touched by the kernel for this output row
            int inBase = src.startIndex + (row - anchor) * src.stride;
            for (int k = 0; k < taps; k++, inBase += src.stride) {
                final float weight = weights[k];
                final int inEnd = inBase + rowLength;
                for (int in = inBase, out = outBase; in < inEnd; in++, out++) {
                    output[out] += input[in] * weight;
                }
            }
        });
    }

    /**
     * Applies a 2D kernel to an interleaved 32-bit float image, skipping the border.
     *
     * <p>Only pixels where the {@code kernel.width} x {@code kernel.width} window fits inside the
     * image are computed: rows and columns start at {@code kernel.offset} and stop
     * {@code kernel.width - kernel.offset - 1} before the far edge. Rows are handed to
     * {@code BoofConcurrency.loopFor}. Kernel weights are consumed in storage order, one kernel
     * row after another.</p>
     *
     * @param kernel 2D kernel
     * @param src input image (read only)
     * @param dst output image; border pixels are left untouched
     */
    public static void convolve(Kernel2D_F32 kernel, InterleavedF32 src, InterleavedF32 dst) {
        final float[] weights = kernel.data;
        final float[] input = src.data;
        final float[] output = dst.data;
        final int bands = src.getNumBands();
        final int lead = kernel.offset;
        final int trail = kernel.width - kernel.offset - 1;
        final int colLimit = src.getWidth() - trail;
        final int rowLimit = src.getHeight() - trail;
        BoofConcurrency.loopFor(lead, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + lead * bands;
            // top-left corner of the kernel window for the first computed pixel of this row
            int windowIdx = src.startIndex + (row - lead) * src.stride;
            for (int col = lead; col < colLimit; col++, windowIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    float sum = 0.0f;
                    int w = 0;
                    int lineIdx = windowIdx + b;
                    for (int kr = 0; kr < kernel.width; kr++, lineIdx += src.stride) {
                        int readIdx = lineIdx;
                        for (int kc = 0; kc < kernel.width; kc++, readIdx += bands) {
                            sum += input[readIdx] * weights[w++];
                        }
                    }
                    output[writeIdx++] = sum;
                }
            }
        });
    }

    /**
     * Applies a 1D kernel along the rows of an interleaved 64-bit float image.
     *
     * <p>Each row index in {@code [0, src.height)} is passed to {@code BoofConcurrency.loopFor}.
     * Only positions where the kernel fits completely inside the row are computed: the first
     * result of a row is stored at column {@code kernel.getOffset()} and
     * {@code src.width - (kernel.getWidth() - 1)} pixels are written. Other pixels of
     * {@code dst} are not touched.</p>
     *
     * @param kernel 1D kernel; its weights are applied to consecutive pixels of the same band
     * @param src input image (read only)
     * @param dst output image receiving the weighted sums
     */
    public static void horizontal(Kernel1D_F64 kernel, InterleavedF64 src, InterleavedF64 dst) {
        final double[] input = src.data;
        final double[] output = dst.data;
        final double[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int pixelsPerRow = src.width - (taps - 1);
        BoofConcurrency.loopFor(0, src.height, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + anchor * bands;
            int pixelIdx = src.startIndex + row * src.stride;
            for (int col = 0; col < pixelsPerRow; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    double sum = 0.0;
                    // step by the band count so only samples of band b are visited
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += bands) {
                        sum += input[readIdx] * weights[k];
                    }
                    output[writeIdx++] = sum;
                }
            }
        });
    }

    /**
     * Applies a 1D kernel along the columns of an interleaved 64-bit float image.
     *
     * <p>Output rows {@code y} in {@code [offset, dst.getHeight() - (kernelWidth - offset - 1))}
     * are handed to {@code BoofConcurrency.loopFor}. Each output row is first cleared and then
     * accumulated one kernel tap at a time, adding a whole weighted source row per tap.</p>
     *
     * @param kernel 1D kernel applied down the columns
     * @param src input image (read only)
     * @param dst output image; only the rows described above are overwritten
     */
    public static void vertical(Kernel1D_F64 kernel, InterleavedF64 src, InterleavedF64 dst) {
        final double[] input = src.data;
        final double[] output = dst.data;
        final double[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rowLimit = dst.getHeight() - (taps - anchor - 1);
        final int rowLength = dst.getWidth() * bands;
        BoofConcurrency.loopFor(anchor, rowLimit, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            Arrays.fill(output, outBase, outBase + rowLength, 0.0);
            // first source row touched by the kernel for this output row
            int inBase = src.startIndex + (row - anchor) * src.stride;
            for (int k = 0; k < taps; k++, inBase += src.stride) {
                final double weight = weights[k];
                final int inEnd = inBase + rowLength;
                for (int in = inBase, out = outBase; in < inEnd; in++, out++) {
                    output[out] += input[in] * weight;
                }
            }
        });
    }

    /**
     * Applies a 2D kernel to an interleaved 64-bit float image, skipping the border.
     *
     * <p>Only pixels where the {@code kernel.width} x {@code kernel.width} window fits inside the
     * image are computed: rows and columns start at {@code kernel.offset} and stop
     * {@code kernel.width - kernel.offset - 1} before the far edge. Rows are handed to
     * {@code BoofConcurrency.loopFor}. Kernel weights are consumed in storage order, one kernel
     * row after another.</p>
     *
     * @param kernel 2D kernel
     * @param src input image (read only)
     * @param dst output image; border pixels are left untouched
     */
    public static void convolve(Kernel2D_F64 kernel, InterleavedF64 src, InterleavedF64 dst) {
        final double[] weights = kernel.data;
        final double[] input = src.data;
        final double[] output = dst.data;
        final int bands = src.getNumBands();
        final int lead = kernel.offset;
        final int trail = kernel.width - kernel.offset - 1;
        final int colLimit = src.getWidth() - trail;
        final int rowLimit = src.getHeight() - trail;
        BoofConcurrency.loopFor(lead, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + lead * bands;
            // top-left corner of the kernel window for the first computed pixel of this row
            int windowIdx = src.startIndex + (row - lead) * src.stride;
            for (int col = lead; col < colLimit; col++, windowIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    double sum = 0.0;
                    int w = 0;
                    int lineIdx = windowIdx + b;
                    for (int kr = 0; kr < kernel.width; kr++, lineIdx += src.stride) {
                        int readIdx = lineIdx;
                        for (int kc = 0; kc < kernel.width; kc++, readIdx += bands) {
                            sum += input[readIdx] * weights[w++];
                        }
                    }
                    output[writeIdx++] = sum;
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the rows of an interleaved 8-bit image and stores the
     * result in a 16-bit image.
     *
     * <p>Source bytes are masked with {@code 0xFF}, i.e. read as unsigned values. The sum is
     * accumulated in an {@code int} and narrowed to {@code short} on store. Only positions where
     * the kernel fits inside the row are written, starting at column
     * {@code kernel.getOffset()}.</p>
     *
     * @param kernel 1D integer kernel
     * @param src input image (read only)
     * @param dst output image receiving the narrowed sums
     */
    public static void horizontal(Kernel1D_S32 kernel, InterleavedU8 src, InterleavedI16 dst) {
        final byte[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int pixelsPerRow = src.width - (taps - 1);
        BoofConcurrency.loopFor(0, src.height, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + anchor * bands;
            int pixelIdx = src.startIndex + row * src.stride;
            for (int col = 0; col < pixelsPerRow; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += bands) {
                        sum += (input[readIdx] & 0xFF) * weights[k];
                    }
                    output[writeIdx++] = (short)sum;
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the columns of an interleaved 8-bit image and stores the
     * result in a 16-bit image.
     *
     * <p>Output rows {@code y} in {@code [offset, dst.getHeight() - (kernelWidth - offset - 1))}
     * are handed to {@code BoofConcurrency.loopFor}. Each output row is cleared, then one
     * weighted source row is added per kernel tap. Source bytes are masked with {@code 0xFF};
     * each product and each running sum is narrowed to {@code short}.</p>
     *
     * @param kernel 1D integer kernel applied down the columns
     * @param src input image (read only)
     * @param dst output image; only the rows described above are overwritten
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedU8 src, InterleavedI16 dst) {
        final byte[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rowLimit = dst.getHeight() - (taps - anchor - 1);
        final int rowLength = dst.getWidth() * bands;
        BoofConcurrency.loopFor(anchor, rowLimit, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            Arrays.fill(output, outBase, outBase + rowLength, (short)0);
            int inBase = src.startIndex + (row - anchor) * src.stride;
            for (int k = 0; k < taps; k++, inBase += src.stride) {
                final int weight = weights[k];
                final int inEnd = inBase + rowLength;
                for (int in = inBase, out = outBase; in < inEnd; in++, out++) {
                    // compound assignment narrows the running sum back to short
                    output[out] += (short)((input[in] & 0xFF) * weight);
                }
            }
        });
    }

    /**
     * Applies a 2D integer kernel to an interleaved 8-bit image, writing a 16-bit image and
     * skipping the border.
     *
     * <p>Source bytes are masked with {@code 0xFF}. The sum is accumulated in an {@code int} and
     * narrowed to {@code short} on store. Rows and columns start at {@code kernel.offset} and
     * stop {@code kernel.width - kernel.offset - 1} before the far edge.</p>
     *
     * @param kernel 2D integer kernel
     * @param src input image (read only)
     * @param dst output image; border pixels are left untouched
     */
    public static void convolve(Kernel2D_S32 kernel, InterleavedU8 src, InterleavedI16 dst) {
        final int[] weights = kernel.data;
        final byte[] input = src.data;
        final short[] output = dst.data;
        final int bands = src.getNumBands();
        final int lead = kernel.offset;
        final int trail = kernel.width - kernel.offset - 1;
        final int colLimit = src.getWidth() - trail;
        final int rowLimit = src.getHeight() - trail;
        BoofConcurrency.loopFor(lead, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + lead * bands;
            int windowIdx = src.startIndex + (row - lead) * src.stride;
            for (int col = lead; col < colLimit; col++, windowIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    int w = 0;
                    int lineIdx = windowIdx + b;
                    for (int kr = 0; kr < kernel.width; kr++, lineIdx += src.stride) {
                        int readIdx = lineIdx;
                        for (int kc = 0; kc < kernel.width; kc++, readIdx += bands) {
                            sum += (input[readIdx] & 0xFF) * weights[w++];
                        }
                    }
                    output[writeIdx++] = (short)sum;
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the rows of an interleaved 8-bit image and stores the
     * result in a 32-bit integer image.
     *
     * <p>Source bytes are masked with {@code 0xFF}, i.e. read as unsigned values. Only positions
     * where the kernel fits inside the row are written, starting at column
     * {@code kernel.getOffset()}.</p>
     *
     * @param kernel 1D integer kernel
     * @param src input image (read only)
     * @param dst output image receiving the sums
     */
    public static void horizontal(Kernel1D_S32 kernel, InterleavedU8 src, InterleavedS32 dst) {
        final byte[] input = src.data;
        final int[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int pixelsPerRow = src.width - (taps - 1);
        BoofConcurrency.loopFor(0, src.height, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + anchor * bands;
            int pixelIdx = src.startIndex + row * src.stride;
            for (int col = 0; col < pixelsPerRow; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += bands) {
                        sum += (input[readIdx] & 0xFF) * weights[k];
                    }
                    output[writeIdx++] = sum;
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the columns of an interleaved 8-bit image and stores the
     * result in a 32-bit integer image.
     *
     * <p>Output rows {@code y} in {@code [offset, dst.getHeight() - (kernelWidth - offset - 1))}
     * are handed to {@code BoofConcurrency.loopFor}. Each output row is cleared, then one
     * weighted source row is added per kernel tap. Source bytes are masked with
     * {@code 0xFF}.</p>
     *
     * @param kernel 1D integer kernel applied down the columns
     * @param src input image (read only)
     * @param dst output image; only the rows described above are overwritten
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedU8 src, InterleavedS32 dst) {
        final byte[] input = src.data;
        final int[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rowLimit = dst.getHeight() - (taps - anchor - 1);
        final int rowLength = dst.getWidth() * bands;
        BoofConcurrency.loopFor(anchor, rowLimit, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            Arrays.fill(output, outBase, outBase + rowLength, 0);
            int inBase = src.startIndex + (row - anchor) * src.stride;
            for (int k = 0; k < taps; k++, inBase += src.stride) {
                final int weight = weights[k];
                final int inEnd = inBase + rowLength;
                for (int in = inBase, out = outBase; in < inEnd; in++, out++) {
                    output[out] += (input[in] & 0xFF) * weight;
                }
            }
        });
    }

    /**
     * Applies a 2D integer kernel to an interleaved 8-bit image, writing a 32-bit integer image
     * and skipping the border.
     *
     * <p>Source bytes are masked with {@code 0xFF}. Rows and columns start at
     * {@code kernel.offset} and stop {@code kernel.width - kernel.offset - 1} before the far
     * edge.</p>
     *
     * @param kernel 2D integer kernel
     * @param src input image (read only)
     * @param dst output image; border pixels are left untouched
     */
    public static void convolve(Kernel2D_S32 kernel, InterleavedU8 src, InterleavedS32 dst) {
        final int[] weights = kernel.data;
        final byte[] input = src.data;
        final int[] output = dst.data;
        final int bands = src.getNumBands();
        final int lead = kernel.offset;
        final int trail = kernel.width - kernel.offset - 1;
        final int colLimit = src.getWidth() - trail;
        final int rowLimit = src.getHeight() - trail;
        BoofConcurrency.loopFor(lead, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + lead * bands;
            int windowIdx = src.startIndex + (row - lead) * src.stride;
            for (int col = lead; col < colLimit; col++, windowIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    int w = 0;
                    int lineIdx = windowIdx + b;
                    for (int kr = 0; kr < kernel.width; kr++, lineIdx += src.stride) {
                        int readIdx = lineIdx;
                        for (int kc = 0; kc < kernel.width; kc++, readIdx += bands) {
                            sum += (input[readIdx] & 0xFF) * weights[w++];
                        }
                    }
                    output[writeIdx++] = sum;
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the columns of an interleaved 16-bit image, divides the
     * sum and stores it in an 8-bit image.
     *
     * <p>Source shorts are masked with {@code 0xFFFF}, i.e. read as unsigned values. Each sum is
     * stored as {@code (byte)((sum + divisor / 2) / divisor)} using integer division. Output
     * rows {@code y} in {@code [offset, dst.getHeight() - (kernelWidth - offset - 1))} are
     * handed to {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D integer kernel applied down the columns
     * @param src input image (read only)
     * @param dst output image; only the rows described above are overwritten
     * @param divisor value the accumulated sum is divided by
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedU16 src, InterleavedI8 dst, int divisor) {
        final short[] input = src.data;
        final byte[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int columns = dst.getWidth();
        final int rowLimit = dst.getHeight() - (taps - anchor - 1);
        BoofConcurrency.loopFor(anchor, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride;
            int pixelIdx = src.startIndex + (row - anchor) * src.stride;
            for (int col = 0; col < columns; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    // walk down the column: one stride per kernel tap
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += src.stride) {
                        sum += (input[readIdx] & 0xFFFF) * weights[k];
                    }
                    output[writeIdx++] = (byte)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the rows of an interleaved signed 16-bit image and
     * stores the result in a 16-bit image.
     *
     * <p>Source values are used as signed shorts (no masking). The sum is accumulated in an
     * {@code int} and narrowed to {@code short} on store. Only positions where the kernel fits
     * inside the row are written, starting at column {@code kernel.getOffset()}.</p>
     *
     * @param kernel 1D integer kernel
     * @param src input image (read only)
     * @param dst output image receiving the narrowed sums
     */
    public static void horizontal(Kernel1D_S32 kernel, InterleavedS16 src, InterleavedI16 dst) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int pixelsPerRow = src.width - (taps - 1);
        BoofConcurrency.loopFor(0, src.height, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + anchor * bands;
            int pixelIdx = src.startIndex + row * src.stride;
            for (int col = 0; col < pixelsPerRow; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += bands) {
                        sum += input[readIdx] * weights[k];
                    }
                    output[writeIdx++] = (short)sum;
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the columns of an interleaved signed 16-bit image and
     * stores the result in a 16-bit image.
     *
     * <p>Output rows {@code y} in {@code [offset, dst.getHeight() - (kernelWidth - offset - 1))}
     * are handed to {@code BoofConcurrency.loopFor}. Each output row is cleared, then one
     * weighted source row is added per kernel tap. Each product and each running sum is narrowed
     * to {@code short}.</p>
     *
     * @param kernel 1D integer kernel applied down the columns
     * @param src input image (read only)
     * @param dst output image; only the rows described above are overwritten
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedS16 src, InterleavedI16 dst) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rowLimit = dst.getHeight() - (taps - anchor - 1);
        final int rowLength = dst.getWidth() * bands;
        BoofConcurrency.loopFor(anchor, rowLimit, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            Arrays.fill(output, outBase, outBase + rowLength, (short)0);
            int inBase = src.startIndex + (row - anchor) * src.stride;
            for (int k = 0; k < taps; k++, inBase += src.stride) {
                final int weight = weights[k];
                final int inEnd = inBase + rowLength;
                for (int in = inBase, out = outBase; in < inEnd; in++, out++) {
                    // compound assignment narrows the running sum back to short
                    output[out] += (short)(input[in] * weight);
                }
            }
        });
    }

    /**
     * Applies a 2D integer kernel to an interleaved signed 16-bit image, writing a 16-bit image
     * and skipping the border.
     *
     * <p>Source values are used as signed shorts. The sum is accumulated in an {@code int} and
     * narrowed to {@code short} on store. Rows and columns start at {@code kernel.offset} and
     * stop {@code kernel.width - kernel.offset - 1} before the far edge.</p>
     *
     * @param kernel 2D integer kernel
     * @param src input image (read only)
     * @param dst output image; border pixels are left untouched
     */
    public static void convolve(Kernel2D_S32 kernel, InterleavedS16 src, InterleavedI16 dst) {
        final int[] weights = kernel.data;
        final short[] input = src.data;
        final short[] output = dst.data;
        final int bands = src.getNumBands();
        final int lead = kernel.offset;
        final int trail = kernel.width - kernel.offset - 1;
        final int colLimit = src.getWidth() - trail;
        final int rowLimit = src.getHeight() - trail;
        BoofConcurrency.loopFor(lead, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + lead * bands;
            int windowIdx = src.startIndex + (row - lead) * src.stride;
            for (int col = lead; col < colLimit; col++, windowIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    int w = 0;
                    int lineIdx = windowIdx + b;
                    for (int kr = 0; kr < kernel.width; kr++, lineIdx += src.stride) {
                        int readIdx = lineIdx;
                        for (int kc = 0; kc < kernel.width; kc++, readIdx += bands) {
                            sum += input[readIdx] * weights[w++];
                        }
                    }
                    output[writeIdx++] = (short)sum;
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the rows of an interleaved 8-bit image, divides the sum
     * and stores it in an 8-bit image.
     *
     * <p>Source bytes are masked with {@code 0xFF}, i.e. read as unsigned values. Each sum is
     * stored as {@code (byte)((sum + divisor / 2) / divisor)} using integer division. Only
     * positions where the kernel fits inside the row are written, starting at column
     * {@code kernel.getOffset()}.</p>
     *
     * @param kernel 1D integer kernel
     * @param src input image (read only)
     * @param dst output image receiving the divided sums
     * @param divisor value the accumulated sum is divided by
     */
    public static void horizontal(Kernel1D_S32 kernel, InterleavedU8 src, InterleavedI8 dst, int divisor) {
        final byte[] input = src.data;
        final byte[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int pixelsPerRow = src.width - (taps - 1);
        BoofConcurrency.loopFor(0, src.height, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + anchor * bands;
            int pixelIdx = src.startIndex + row * src.stride;
            for (int col = 0; col < pixelsPerRow; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += bands) {
                        sum += (input[readIdx] & 0xFF) * weights[k];
                    }
                    output[writeIdx++] = (byte)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the columns of an interleaved 8-bit image, divides the
     * sum and stores it in an 8-bit image.
     *
     * <p>Source bytes are masked with {@code 0xFF}, i.e. read as unsigned values. Each sum is
     * stored as {@code (byte)((sum + divisor / 2) / divisor)} using integer division. Output
     * rows {@code y} in {@code [offset, dst.getHeight() - (kernelWidth - offset - 1))} are
     * handed to {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D integer kernel applied down the columns
     * @param src input image (read only)
     * @param dst output image; only the rows described above are overwritten
     * @param divisor value the accumulated sum is divided by
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedU8 src, InterleavedI8 dst, int divisor) {
        final byte[] input = src.data;
        final byte[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int columns = dst.getWidth();
        final int rowLimit = dst.getHeight() - (taps - anchor - 1);
        BoofConcurrency.loopFor(anchor, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride;
            int pixelIdx = src.startIndex + (row - anchor) * src.stride;
            for (int col = 0; col < columns; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    // walk down the column: one stride per kernel tap
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += src.stride) {
                        sum += (input[readIdx] & 0xFF) * weights[k];
                    }
                    output[writeIdx++] = (byte)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Applies a 2D integer kernel to an interleaved 8-bit image, divides the sum and stores it
     * in an 8-bit image, skipping the border.
     *
     * <p>Source bytes are masked with {@code 0xFF}. Each sum is stored as
     * {@code (byte)((sum + divisor / 2) / divisor)} using integer division. Rows and columns
     * start at {@code kernel.offset} and stop {@code kernel.width - kernel.offset - 1} before
     * the far edge.</p>
     *
     * @param kernel 2D integer kernel
     * @param src input image (read only)
     * @param dst output image; border pixels are left untouched
     * @param divisor value the accumulated sum is divided by
     */
    public static void convolve(Kernel2D_S32 kernel, InterleavedU8 src, InterleavedI8 dst, int divisor) {
        final int[] weights = kernel.data;
        final byte[] input = src.data;
        final byte[] output = dst.data;
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int lead = kernel.offset;
        final int trail = kernel.width - kernel.offset - 1;
        final int colLimit = src.getWidth() - trail;
        final int rowLimit = src.getHeight() - trail;
        BoofConcurrency.loopFor(lead, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + lead * bands;
            int windowIdx = src.startIndex + (row - lead) * src.stride;
            for (int col = lead; col < colLimit; col++, windowIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    int w = 0;
                    int lineIdx = windowIdx + b;
                    for (int kr = 0; kr < kernel.width; kr++, lineIdx += src.stride) {
                        int readIdx = lineIdx;
                        for (int kc = 0; kc < kernel.width; kc++, readIdx += bands) {
                            sum += (input[readIdx] & 0xFF) * weights[w++];
                        }
                    }
                    output[writeIdx++] = (byte)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the rows of an interleaved signed 16-bit image, divides
     * the sum and stores it in a 16-bit image.
     *
     * <p>Source values are used as signed shorts. Each sum is stored as
     * {@code (short)((sum + divisor / 2) / divisor)} using integer division. Only positions
     * where the kernel fits inside the row are written, starting at column
     * {@code kernel.getOffset()}.</p>
     *
     * @param kernel 1D integer kernel
     * @param src input image (read only)
     * @param dst output image receiving the divided sums
     * @param divisor value the accumulated sum is divided by
     */
    public static void horizontal(Kernel1D_S32 kernel, InterleavedS16 src, InterleavedI16 dst, int divisor) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int pixelsPerRow = src.width - (taps - 1);
        BoofConcurrency.loopFor(0, src.height, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + anchor * bands;
            int pixelIdx = src.startIndex + row * src.stride;
            for (int col = 0; col < pixelsPerRow; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += bands) {
                        sum += input[readIdx] * weights[k];
                    }
                    output[writeIdx++] = (short)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the columns of an interleaved signed 16-bit image,
     * divides the sum and stores it in a 16-bit image.
     *
     * <p>Source values are used as signed shorts. Each sum is stored as
     * {@code (short)((sum + divisor / 2) / divisor)} using integer division. Output rows
     * {@code y} in {@code [offset, dst.getHeight() - (kernelWidth - offset - 1))} are handed to
     * {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D integer kernel applied down the columns
     * @param src input image (read only)
     * @param dst output image; only the rows described above are overwritten
     * @param divisor value the accumulated sum is divided by
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedS16 src, InterleavedI16 dst, int divisor) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int columns = dst.getWidth();
        final int rowLimit = dst.getHeight() - (taps - anchor - 1);
        BoofConcurrency.loopFor(anchor, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride;
            int pixelIdx = src.startIndex + (row - anchor) * src.stride;
            for (int col = 0; col < columns; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    // walk down the column: one stride per kernel tap
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += src.stride) {
                        sum += input[readIdx] * weights[k];
                    }
                    output[writeIdx++] = (short)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Applies a 2D integer kernel to an interleaved signed 16-bit image, divides the sum and
     * stores it in a 16-bit image, skipping the border.
     *
     * <p>Source values are used as signed shorts. Each sum is stored as
     * {@code (short)((sum + divisor / 2) / divisor)} using integer division. Rows and columns
     * start at {@code kernel.offset} and stop {@code kernel.width - kernel.offset - 1} before
     * the far edge.</p>
     *
     * @param kernel 2D integer kernel
     * @param src input image (read only)
     * @param dst output image; border pixels are left untouched
     * @param divisor value the accumulated sum is divided by
     */
    public static void convolve(Kernel2D_S32 kernel, InterleavedS16 src, InterleavedI16 dst, int divisor) {
        final int[] weights = kernel.data;
        final short[] input = src.data;
        final short[] output = dst.data;
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int lead = kernel.offset;
        final int trail = kernel.width - kernel.offset - 1;
        final int colLimit = src.getWidth() - trail;
        final int rowLimit = src.getHeight() - trail;
        BoofConcurrency.loopFor(lead, rowLimit, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + lead * bands;
            int windowIdx = src.startIndex + (row - lead) * src.stride;
            for (int col = lead; col < colLimit; col++, windowIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    int w = 0;
                    int lineIdx = windowIdx + b;
                    for (int kr = 0; kr < kernel.width; kr++, lineIdx += src.stride) {
                        int readIdx = lineIdx;
                        for (int kc = 0; kc < kernel.width; kc++, readIdx += bands) {
                            sum += input[readIdx] * weights[w++];
                        }
                    }
                    output[writeIdx++] = (short)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the rows of an interleaved 16-bit image and stores the
     * result in a 16-bit image.
     *
     * <p>Source shorts are masked with {@code 0xFFFF}, i.e. read as unsigned values. The sum is
     * accumulated in an {@code int} and narrowed to {@code short} on store. Only positions where
     * the kernel fits inside the row are written, starting at column
     * {@code kernel.getOffset()}.</p>
     *
     * @param kernel 1D integer kernel
     * @param src input image (read only)
     * @param dst output image receiving the narrowed sums
     */
    public static void horizontal(Kernel1D_S32 kernel, InterleavedU16 src, InterleavedI16 dst) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int pixelsPerRow = src.width - (taps - 1);
        BoofConcurrency.loopFor(0, src.height, row -> {
            int writeIdx = dst.startIndex + row * dst.stride + anchor * bands;
            int pixelIdx = src.startIndex + row * src.stride;
            for (int col = 0; col < pixelsPerRow; col++, pixelIdx += bands) {
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int k = 0, readIdx = pixelIdx + b; k < taps; k++, readIdx += bands) {
                        sum += (input[readIdx] & 0xFFFF) * weights[k];
                    }
                    output[writeIdx++] = (short)sum;
                }
            }
        });
    }

    /**
     * Applies a 1D integer kernel along the columns of an interleaved 16-bit image and stores
     * the result in a 16-bit image.
     *
     * <p>Output rows {@code y} in {@code [offset, dst.getHeight() - (kernelWidth - offset - 1))}
     * are handed to {@code BoofConcurrency.loopFor}. Each output row is cleared, then one
     * weighted source row is added per kernel tap. Source shorts are masked with
     * {@code 0xFFFF}; each product and each running sum is narrowed to {@code short}.</p>
     *
     * @param kernel 1D integer kernel applied down the columns
     * @param src input image (read only)
     * @param dst output image; only the rows described above are overwritten
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedU16 src, InterleavedI16 dst) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int anchor = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rowLimit = dst.getHeight() - (taps - anchor - 1);
        final int rowLength = dst.getWidth() * bands;
        BoofConcurrency.loopFor(anchor, rowLimit, row -> {
            final int outBase = dst.startIndex + row * dst.stride;
            Arrays.fill(output, outBase, outBase + rowLength, (short)0);
            int inBase = src.startIndex + (row - anchor) * src.stride;
            for (int k = 0; k < taps; k++, inBase += src.stride) {
                final int weight = weights[k];
                final int inEnd = inBase + rowLength;
                for (int in = inBase, out = outBase; in < inEnd; in++, out++) {
                    // compound assignment narrows the running sum back to short
                    output[out] += (short)((input[in] & 0xFFFF) * weight);
                }
            }
        });
    }

    /**
     * Applies a 2D integer kernel to an interleaved image whose samples are read as
     * unsigned 16-bit values, storing each sum truncated to 16 bits.
     *
     * <p>The kernel is traversed as {@code kernel.width x kernel.width} in row-major order.
     * Pixels closer than {@code kernel.offset} to the top/left edge, or closer than
     * {@code kernel.width - kernel.offset - 1} to the bottom/right edge, are not written.
     * Rows are dispatched through {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 2D kernel with integer weights
     * @param src input image, samples masked with {@code 0xFFFF} before multiplying
     * @param dst output image receiving the truncated sums
     */
    public static void convolve(Kernel2D_S32 kernel, InterleavedU16 src, InterleavedI16 dst) {
        final int[] weights = kernel.data;
        final short[] input = src.data;
        final short[] output = dst.data;
        final int cols = src.getWidth();
        final int rows = src.getHeight();
        final int bands = src.getNumBands();
        final int before = kernel.offset;
        final int after = kernel.width - kernel.offset - 1;

        BoofConcurrency.loopFor(before, rows - after, row -> {
            int out = dst.startIndex + row * dst.stride + before * bands;
            final int inRow = src.startIndex + (row - before) * src.stride;
            for (int col = before; col < cols - after; col++) {
                // Top-left corner of the window covered by the kernel for this pixel.
                final int window = inRow + (col - before) * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    int w = 0;
                    for (int ky = 0; ky < kernel.width; ky++) {
                        final int line = window + ky * src.stride + b;
                        for (int kx = 0; kx < kernel.width; kx++, w++) {
                            sum += (input[line + kx * bands] & 0xFFFF) * weights[w];
                        }
                    }
                    output[out] = (short)sum;
                    out++;
                }
            }
        });
    }

    /**
     * Convolves a 1D integer kernel along the rows of an interleaved image whose samples
     * are read as unsigned 16-bit values, then divides each sum by {@code divisor}.
     *
     * <p>Each stored value is {@code (sum + divisor / 2) / divisor} using Java integer
     * arithmetic, truncated to 16 bits. For every row, writing starts at pixel
     * {@code kernel.getOffset()} and covers {@code src.width - (kernelWidth - 1)} pixels.
     * Rows are dispatched through {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D kernel with integer weights
     * @param src input image, samples masked with {@code 0xFFFF} before multiplying
     * @param dst output image receiving the normalized, truncated sums
     * @param divisor value the accumulated sum is divided by
     */
    public static void horizontal(Kernel1D_S32 kernel, InterleavedU16 src, InterleavedI16 dst, int divisor) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int lead = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int positions = src.width - (taps - 1);

        BoofConcurrency.loopFor(0, src.height, row -> {
            final int inRow = src.startIndex + row * src.stride;
            int out = dst.startIndex + row * dst.stride + lead * bands;
            for (int pos = 0; pos < positions; pos++) {
                final int window = inRow + pos * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int t = 0; t < taps; t++) {
                        // Consecutive taps of one band are numBands elements apart.
                        sum += (input[window + b + t * bands] & 0xFFFF) * weights[t];
                    }
                    output[out++] = (short)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Convolves a 1D integer kernel down the columns of an interleaved image whose
     * samples are read as unsigned 16-bit values, then divides each sum by {@code divisor}.
     *
     * <p>Each stored value is {@code (sum + divisor / 2) / divisor} using Java integer
     * arithmetic, truncated to 16 bits. Only rows in
     * {@code [offset, height - (kernelWidth - offset - 1))} are written, with width and
     * height taken from {@code dst}. Rows are dispatched through
     * {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D kernel with integer weights
     * @param src input image, samples masked with {@code 0xFFFF} before multiplying
     * @param dst output image receiving the normalized, truncated sums
     * @param divisor value the accumulated sum is divided by
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedU16 src, InterleavedI16 dst, int divisor) {
        final short[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int top = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int cols = dst.getWidth();
        final int yStop = dst.getHeight() - (taps - top - 1);

        BoofConcurrency.loopFor(top, yStop, row -> {
            int out = dst.startIndex + row * dst.stride;
            final int inTop = src.startIndex + (row - top) * src.stride;
            for (int col = 0; col < cols; col++) {
                final int column = inTop + col * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int t = 0; t < taps; t++) {
                        // Each tap reads the same column/band one source row further down.
                        sum += (input[column + b + t * src.stride] & 0xFFFF) * weights[t];
                    }
                    output[out++] = (short)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Applies a 2D integer kernel to an interleaved image whose samples are read as
     * unsigned 16-bit values, then divides each sum by {@code divisor}.
     *
     * <p>Each stored value is {@code (sum + divisor / 2) / divisor} using Java integer
     * arithmetic, truncated to 16 bits. The kernel is traversed as
     * {@code kernel.width x kernel.width} in row-major order. Pixels closer than
     * {@code kernel.offset} to the top/left edge, or closer than
     * {@code kernel.width - kernel.offset - 1} to the bottom/right edge, are not written.
     * Rows are dispatched through {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 2D kernel with integer weights
     * @param src input image, samples masked with {@code 0xFFFF} before multiplying
     * @param dst output image receiving the normalized, truncated sums
     * @param divisor value the accumulated sum is divided by
     */
    public static void convolve(Kernel2D_S32 kernel, InterleavedU16 src, InterleavedI16 dst, int divisor) {
        final int[] weights = kernel.data;
        final short[] input = src.data;
        final short[] output = dst.data;
        final int cols = src.getWidth();
        final int rows = src.getHeight();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int before = kernel.offset;
        final int after = kernel.width - kernel.offset - 1;

        BoofConcurrency.loopFor(before, rows - after, row -> {
            int out = dst.startIndex + row * dst.stride + before * bands;
            final int inRow = src.startIndex + (row - before) * src.stride;
            for (int col = before; col < cols - after; col++) {
                // Top-left corner of the window covered by the kernel for this pixel.
                final int window = inRow + (col - before) * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    int w = 0;
                    for (int ky = 0; ky < kernel.width; ky++) {
                        final int line = window + ky * src.stride + b;
                        for (int kx = 0; kx < kernel.width; kx++, w++) {
                            sum += (input[line + kx * bands] & 0xFFFF) * weights[w];
                        }
                    }
                    output[out] = (short)((sum + rounding) / divisor);
                    out++;
                }
            }
        });
    }

    /**
     * Convolves a 1D integer kernel down the columns of a 32-bit integer interleaved
     * image, divides each sum by {@code divisor} and stores it as a 16-bit value.
     *
     * <p>Each stored value is {@code (sum + divisor / 2) / divisor} using Java integer
     * arithmetic, truncated to 16 bits. Only rows in
     * {@code [offset, height - (kernelWidth - offset - 1))} are written, with width and
     * height taken from {@code dst}. Rows are dispatched through
     * {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D kernel with integer weights
     * @param src input image with {@code int} samples
     * @param dst output image receiving the normalized, truncated sums
     * @param divisor value the accumulated sum is divided by
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedS32 src, InterleavedI16 dst, int divisor) {
        final int[] input = src.data;
        final short[] output = dst.data;
        final int[] weights = kernel.data;
        final int top = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int cols = dst.getWidth();
        final int yStop = dst.getHeight() - (taps - top - 1);

        BoofConcurrency.loopFor(top, yStop, row -> {
            int out = dst.startIndex + row * dst.stride;
            final int inTop = src.startIndex + (row - top) * src.stride;
            for (int col = 0; col < cols; col++) {
                final int column = inTop + col * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int t = 0; t < taps; t++) {
                        // Each tap reads the same column/band one source row further down.
                        sum += input[column + b + t * src.stride] * weights[t];
                    }
                    output[out++] = (short)((sum + rounding) / divisor);
                }
            }
        });
    }

    /**
     * Convolves a 1D integer kernel along the rows of a 32-bit integer interleaved image.
     *
     * <p>For every row, writing starts at pixel {@code kernel.getOffset()} and covers
     * {@code src.width - (kernelWidth - 1)} pixels; the raw {@code int} sum is stored
     * without normalization. Rows are dispatched through
     * {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D kernel with integer weights
     * @param src input image with {@code int} samples
     * @param dst output image receiving the sums
     */
    public static void horizontal(Kernel1D_S32 kernel, InterleavedS32 src, InterleavedS32 dst) {
        final int[] input = src.data;
        final int[] output = dst.data;
        final int[] weights = kernel.data;
        final int lead = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int positions = src.width - (taps - 1);

        BoofConcurrency.loopFor(0, src.height, row -> {
            final int inRow = src.startIndex + row * src.stride;
            int out = dst.startIndex + row * dst.stride + lead * bands;
            for (int pos = 0; pos < positions; pos++) {
                final int window = inRow + pos * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int t = 0; t < taps; t++) {
                        // Consecutive taps of one band are numBands elements apart.
                        sum += input[window + b + t * bands] * weights[t];
                    }
                    output[out++] = sum;
                }
            }
        });
    }

    /**
     * Convolves a 1D integer kernel down the columns of a 32-bit integer interleaved image.
     *
     * <p>Only rows in {@code [offset, height - (kernelWidth - offset - 1))} are written,
     * where the height is taken from {@code dst}. Each such destination row is first
     * zeroed and then accumulated one kernel tap at a time. Rows are dispatched through
     * {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D kernel; its offset and width define the skipped border rows
     * @param src input image with {@code int} samples
     * @param dst output image receiving the sums
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedS32 src, InterleavedS32 dst) {
        final int[] input = src.data;
        final int[] output = dst.data;
        final int[] weights = kernel.data;
        final int top = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rowLength = dst.getWidth() * bands;
        final int yStop = dst.getHeight() - (taps - top - 1);

        BoofConcurrency.loopFor(top, yStop, row -> {
            final int outBegin = dst.startIndex + row * dst.stride;
            final int inBegin = src.startIndex + (row - top) * src.stride;

            // Start from a clean row; the taps below are summed into it in place.
            Arrays.fill(output, outBegin, outBegin + rowLength, 0);

            for (int tap = 0; tap < taps; tap++) {
                final int weight = weights[tap];
                final int inRow = inBegin + tap * src.stride;
                for (int e = 0; e < rowLength; e++) {
                    output[outBegin + e] += input[inRow + e] * weight;
                }
            }
        });
    }

    /**
     * Applies a 2D integer kernel to a 32-bit integer interleaved image, storing the raw
     * {@code int} sums.
     *
     * <p>The kernel is traversed as {@code kernel.width x kernel.width} in row-major order.
     * Pixels closer than {@code kernel.offset} to the top/left edge, or closer than
     * {@code kernel.width - kernel.offset - 1} to the bottom/right edge, are not written.
     * Rows are dispatched through {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 2D kernel with integer weights
     * @param src input image with {@code int} samples
     * @param dst output image receiving the sums
     */
    public static void convolve(Kernel2D_S32 kernel, InterleavedS32 src, InterleavedS32 dst) {
        final int[] weights = kernel.data;
        final int[] input = src.data;
        final int[] output = dst.data;
        final int cols = src.getWidth();
        final int rows = src.getHeight();
        final int bands = src.getNumBands();
        final int before = kernel.offset;
        final int after = kernel.width - kernel.offset - 1;

        BoofConcurrency.loopFor(before, rows - after, row -> {
            int out = dst.startIndex + row * dst.stride + before * bands;
            final int inRow = src.startIndex + (row - before) * src.stride;
            for (int col = before; col < cols - after; col++) {
                // Top-left corner of the window covered by the kernel for this pixel.
                final int window = inRow + (col - before) * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    int w = 0;
                    for (int ky = 0; ky < kernel.width; ky++) {
                        final int line = window + ky * src.stride + b;
                        for (int kx = 0; kx < kernel.width; kx++, w++) {
                            sum += input[line + kx * bands] * weights[w];
                        }
                    }
                    output[out] = sum;
                    out++;
                }
            }
        });
    }

    /**
     * Convolves a 1D integer kernel along the rows of a 32-bit integer interleaved image,
     * then divides each sum by {@code divisor}.
     *
     * <p>Each stored value is {@code (sum + divisor / 2) / divisor} using Java integer
     * arithmetic. For every row, writing starts at pixel {@code kernel.getOffset()} and
     * covers {@code src.width - (kernelWidth - 1)} pixels. Rows are dispatched through
     * {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D kernel with integer weights
     * @param src input image with {@code int} samples
     * @param dst output image receiving the normalized sums
     * @param divisor value the accumulated sum is divided by
     */
    public static void horizontal(Kernel1D_S32 kernel, InterleavedS32 src, InterleavedS32 dst, int divisor) {
        final int[] input = src.data;
        final int[] output = dst.data;
        final int[] weights = kernel.data;
        final int lead = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int positions = src.width - (taps - 1);

        BoofConcurrency.loopFor(0, src.height, row -> {
            final int inRow = src.startIndex + row * src.stride;
            int out = dst.startIndex + row * dst.stride + lead * bands;
            for (int pos = 0; pos < positions; pos++) {
                final int window = inRow + pos * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int t = 0; t < taps; t++) {
                        // Consecutive taps of one band are numBands elements apart.
                        sum += input[window + b + t * bands] * weights[t];
                    }
                    output[out++] = (sum + rounding) / divisor;
                }
            }
        });
    }

    /**
     * Convolves a 1D integer kernel down the columns of a 32-bit integer interleaved
     * image, then divides each sum by {@code divisor}.
     *
     * <p>Each stored value is {@code (sum + divisor / 2) / divisor} using Java integer
     * arithmetic. Only rows in {@code [offset, height - (kernelWidth - offset - 1))} are
     * written, with width and height taken from {@code dst}. Rows are dispatched through
     * {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 1D kernel with integer weights
     * @param src input image with {@code int} samples
     * @param dst output image receiving the normalized sums
     * @param divisor value the accumulated sum is divided by
     */
    public static void vertical(Kernel1D_S32 kernel, InterleavedS32 src, InterleavedS32 dst, int divisor) {
        final int[] input = src.data;
        final int[] output = dst.data;
        final int[] weights = kernel.data;
        final int top = kernel.getOffset();
        final int taps = kernel.getWidth();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int cols = dst.getWidth();
        final int yStop = dst.getHeight() - (taps - top - 1);

        BoofConcurrency.loopFor(top, yStop, row -> {
            int out = dst.startIndex + row * dst.stride;
            final int inTop = src.startIndex + (row - top) * src.stride;
            for (int col = 0; col < cols; col++) {
                final int column = inTop + col * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    for (int t = 0; t < taps; t++) {
                        // Each tap reads the same column/band one source row further down.
                        sum += input[column + b + t * src.stride] * weights[t];
                    }
                    output[out++] = (sum + rounding) / divisor;
                }
            }
        });
    }

    /**
     * Applies a 2D integer kernel to a 32-bit integer interleaved image, then divides
     * each sum by {@code divisor}.
     *
     * <p>Each stored value is {@code (sum + divisor / 2) / divisor} using Java integer
     * arithmetic. The kernel is traversed as {@code kernel.width x kernel.width} in
     * row-major order. Pixels closer than {@code kernel.offset} to the top/left edge, or
     * closer than {@code kernel.width - kernel.offset - 1} to the bottom/right edge, are
     * not written. Rows are dispatched through {@code BoofConcurrency.loopFor}.</p>
     *
     * @param kernel 2D kernel with integer weights
     * @param src input image with {@code int} samples
     * @param dst output image receiving the normalized sums
     * @param divisor value the accumulated sum is divided by
     */
    public static void convolve(Kernel2D_S32 kernel, InterleavedS32 src, InterleavedS32 dst, int divisor) {
        final int[] weights = kernel.data;
        final int[] input = src.data;
        final int[] output = dst.data;
        final int cols = src.getWidth();
        final int rows = src.getHeight();
        final int bands = src.getNumBands();
        final int rounding = divisor / 2;
        final int before = kernel.offset;
        final int after = kernel.width - kernel.offset - 1;

        BoofConcurrency.loopFor(before, rows - after, row -> {
            int out = dst.startIndex + row * dst.stride + before * bands;
            final int inRow = src.startIndex + (row - before) * src.stride;
            for (int col = before; col < cols - after; col++) {
                // Top-left corner of the window covered by the kernel for this pixel.
                final int window = inRow + (col - before) * bands;
                for (int b = 0; b < bands; b++) {
                    int sum = 0;
                    int w = 0;
                    for (int ky = 0; ky < kernel.width; ky++) {
                        final int line = window + ky * src.stride + b;
                        for (int kx = 0; kx < kernel.width; kx++, w++) {
                            sum += input[line + kx * bands] * weights[w];
                        }
                    }
                    output[out] = (sum + rounding) / divisor;
                    out++;
                }
            }
        });
    }
}

