/*
 * Decompiled with CFR 0.152.
 */
package org.carrot2.text.preprocessing;

import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntStack;
import com.carrotsearch.hppc.ShortArrayList;
import com.carrotsearch.hppc.sorting.IndirectSort;
import java.util.ArrayList;
import java.util.Arrays;
import org.carrot2.core.attribute.Processing;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.SparseArray;
import org.carrot2.text.util.CharArrayComparators;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.constraint.IntRange;

/**
 * Groups tokens that differ only in letter case into single "words" and fills
 * the {@code allWords} arrays of a {@link PreprocessingContext}.
 *
 * <p>For every group, the case variant that occurs most often is used as the
 * word's image. Each token receives the index of its word in
 * {@code allTokens.wordIndex}, or {@code -1} if the token was not assigned to
 * any word.
 */
@Bindable(prefix="CaseNormalizer")
public final class CaseNormalizer {
    /**
     * Word document frequency threshold. A word is stored only when the number
     * of entries produced by {@code SparseArray.toSparseEncoding} for its
     * documents, halved, is at least this value (presumably the number of
     * distinct documents the word occurs in; verify against SparseArray).
     */
    @Processing
    @Input
    @Attribute
    @IntRange(min=1, max=100)
    @Label(value="Word document frequency threshold")
    @Level(value=AttributeLevel.ADVANCED)
    @Group(value="Preprocessing")
    public int dfThreshold = 1;

    /**
     * Performs case normalization of the tokens found in {@code context}.
     *
     * <p>Reads {@code context.allTokens} (image, type, documentIndex,
     * fieldIndex) and {@code context.allFields.name}; writes
     * {@code context.allTokens.wordIndex} and {@code context.allWords}
     * (image, tf, tfByDocument, fieldIndices, type).
     *
     * @param context preprocessing context to read tokens from and store words in
     */
    public void normalize(PreprocessingContext context) {
        final char[][] tokenImages = context.allTokens.image;
        final short[] tokenTypesArray = context.allTokens.type;
        final int[] documentIndexesArray = context.allTokens.documentIndex;
        final byte[] tokensFieldIndex = context.allTokens.fieldIndex;
        final int tokenCount = tokenImages.length;

        // Holders for the per-word output arrays.
        final ArrayList<char[]> normalizedWordImages = Lists.newArrayList();
        final IntArrayList normalizedWordTf = new IntArrayList();
        final ArrayList<int[]> wordTfByDocumentList = Lists.newArrayList();
        final ByteArrayList fieldIndexList = new ByteArrayList();
        final ShortArrayList types = new ShortArrayList();

        // -1 marks tokens that do not belong to any stored word.
        final int[] wordIndexes = new int[tokenCount];
        Arrays.fill(wordIndexes, -1);

        if (tokenCount == 0) {
            // Nothing to group; publish empty word tables instead of failing
            // on the first-element accesses below.
            this.storeWords(context, wordIndexes, normalizedWordImages, normalizedWordTf, wordTfByDocumentList, fieldIndexList, types);
            return;
        }

        // Indirect sort: tokenImagesOrder[k] is the index of the k-th token in
        // sorted order, so equal images (and their case variants) are adjacent.
        final int[] tokenImagesOrder = IndirectSort.mergesort(tokenImages, 0, tokenCount, CharArrayComparators.NORMALIZING_CHAR_ARRAY_COMPARATOR);

        int tf = 1;                                  // occurrences of the current case variant
        int maxTf = 1;                               // occurrences of the most frequent variant so far
        int maxTfVariantIndex = tokenImagesOrder[0]; // token index of that most frequent variant
        int totalTf = 1;                             // occurrences of the current word, all variants
        int variantStartIndex = 0;                   // position in tokenImagesOrder where the word starts

        // Fields in which the current word has been seen.
        final BitSet fieldIndices = new BitSet(context.allFields.name.length);

        // Document indices of the current word's occurrences (may contain duplicates).
        final IntStack wordDocuments = new IntStack();
        if (documentIndexesArray[tokenImagesOrder[0]] >= 0) {
            wordDocuments.push(documentIndexesArray[tokenImagesOrder[0]]);
        }

        // Walk over adjacent pairs (i, i + 1) of the sorted tokens.
        for (int i = 0; i < tokenImagesOrder.length - 1; ++i) {
            final char[] image = tokenImages[tokenImagesOrder[i]];
            final char[] nextImage = tokenImages[tokenImagesOrder[i + 1]];
            final short tokenType = tokenTypesArray[tokenImagesOrder[i]];
            final int documentIndex = documentIndexesArray[tokenImagesOrder[i + 1]];

            // A null image ends processing (presumably nulls are sorted last
            // by the normalizing comparator; confirm in CharArrayComparators).
            if (image == null) {
                break;
            }

            // Tokens of non-indexed types never become words; start over at i + 1.
            if (this.isNotIndexed(tokenType)) {
                variantStartIndex = i + 1;
                maxTfVariantIndex = tokenImagesOrder[i + 1];
                this.resetForNewTokenImage(documentIndexesArray, tokenImagesOrder, fieldIndices, wordDocuments, i);
                continue;
            }

            fieldIndices.set(tokensFieldIndex[tokenImagesOrder[i]]);

            final boolean sameCase = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR.compare(image, nextImage) == 0;
            if (sameCase) {
                // Exactly the same image: one more occurrence of this variant.
                ++tf;
                ++totalTf;
                wordDocuments.push(documentIndex);
                continue;
            }

            // The variant ending at position i is complete; remember it if it
            // is the most frequent one seen for this word.
            if (maxTf < tf) {
                maxTf = tf;
                maxTfVariantIndex = tokenImagesOrder[i];
            }
            // Always restart the per-variant counter. Resetting it only when a
            // new maximum was found let the finished variant's count leak into
            // the next variant and could select the wrong representative image.
            tf = 1;

            final boolean sameImage = CharArrayComparators.CASE_INSENSITIVE_CHAR_ARRAY_COMPARATOR.compare(image, nextImage) == 0;
            if (sameImage) {
                // Same word in a different case: keep accumulating word totals.
                ++totalTf;
                wordDocuments.push(documentIndex);
                continue;
            }

            // The word is complete. The stack size is an upper bound for the
            // encoded document count, so it is checked first to skip the
            // encoding for words that cannot reach the threshold.
            if (wordDocuments.size() >= this.dfThreshold) {
                final int[] sparseEncoding = SparseArray.toSparseEncoding(wordDocuments);
                final int df = sparseEncoding.length >> 1;
                if (df >= this.dfThreshold) {
                    wordTfByDocumentList.add(sparseEncoding);
                    normalizedWordImages.add(tokenImages[maxTfVariantIndex]);
                    types.add(tokenTypesArray[maxTfVariantIndex]);
                    normalizedWordTf.add(totalTf);
                    // Only the lowest 8 bits of the field set are kept.
                    fieldIndexList.add((byte)fieldIndices.bits[0]);

                    // Point every token of this word at the entry just added.
                    final int wordIndex = normalizedWordImages.size() - 1;
                    for (int j = variantStartIndex; j < i + 1; ++j) {
                        wordIndexes[tokenImagesOrder[j]] = wordIndex;
                    }
                }
            }

            // Start a new word at position i + 1.
            totalTf = 1;
            tf = 1;
            maxTf = 1;
            maxTfVariantIndex = tokenImagesOrder[i + 1];
            variantStartIndex = i + 1;
            this.resetForNewTokenImage(documentIndexesArray, tokenImagesOrder, fieldIndices, wordDocuments, i);
        }

        this.storeWords(context, wordIndexes, normalizedWordImages, normalizedWordTf, wordTfByDocumentList, fieldIndexList, types);
    }

    /**
     * Copies the collected per-word data into {@code context.allWords} and the
     * token-to-word mapping into {@code context.allTokens.wordIndex}.
     */
    private void storeWords(PreprocessingContext context, int[] wordIndexes, ArrayList<char[]> wordImages, IntArrayList wordTf, ArrayList<int[]> wordTfByDocument, ByteArrayList wordFieldIndices, ShortArrayList wordTypes) {
        context.allTokens.wordIndex = wordIndexes;
        context.allWords.image = wordImages.toArray(new char[wordImages.size()][]);
        context.allWords.tf = wordTf.toArray();
        context.allWords.tfByDocument = wordTfByDocument.toArray(new int[wordTfByDocument.size()][]);
        context.allWords.fieldIndices = wordFieldIndices.toArray();
        context.allWords.type = wordTypes.toArray();
    }

    /**
     * Clears the per-word field set and document stack, then seeds the stack
     * with the document of the token at sorted position {@code i + 1} unless
     * that token's document index is negative.
     *
     * @param documentIndexesArray document index of every token
     * @param tokenImagesOrder token indices in sorted order
     * @param fieldIndices field set to clear
     * @param wordDocuments document stack to clear and re-seed
     * @param i sorted position of the token that ends the previous word
     */
    private void resetForNewTokenImage(int[] documentIndexesArray, int[] tokenImagesOrder, BitSet fieldIndices, IntStack wordDocuments, int i) {
        fieldIndices.clear();
        wordDocuments.clear();
        if (documentIndexesArray[tokenImagesOrder[i + 1]] >= 0) {
            wordDocuments.push(documentIndexesArray[tokenImagesOrder[i + 1]]);
        }
    }

    /**
     * Tells whether tokens of the given type are excluded from word indexing.
     *
     * <p>NOTE(review): the literals are decompiled constants, presumably
     * token type/flag values declared by the tokenizer interface; confirm
     * their names there before replacing them with symbolic constants.
     *
     * @param tokenType token type code, possibly combined with flag bits
     * @return {@code true} if the type equals 3 or 6, or has bit 0x100 set
     */
    private boolean isNotIndexed(int tokenType) {
        return tokenType == 3 || tokenType == 6 || (tokenType & 0x100) != 0;
    }
}

