/*
 * Decompiled with CFR 0.152.
 */
package it.unimi.dsi.mg4j.document;

import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.mg4j.document.Document;
import it.unimi.dsi.mg4j.document.DocumentCollectionBuilder;
import it.unimi.dsi.mg4j.document.DocumentFactory;
import it.unimi.dsi.mg4j.document.DocumentIterator;
import it.unimi.dsi.mg4j.document.DocumentSequence;
import it.unimi.dsi.mg4j.document.SimpleCompressedDocumentCollection;
import it.unimi.dsi.mg4j.tool.Scan;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.Reader;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.CountingOutputStream;

/**
 * Writes the set of files that back a {@link SimpleCompressedDocumentCollection}.
 *
 * <p>For a collection named <var>name</var> (the basename plus the suffix passed to
 * {@link #open(CharSequence)}) the builder produces:
 * <ul>
 * <li><code><var>name</var>.documents</code>: per document, the URI, the title and, for each text field,
 * the field length followed by the delta-coded output of the frequency codec for each (non)term index;</li>
 * <li><code><var>name</var>.terms</code> / <code><var>name</var>.nonterms</code>: the distinct words / nonwords,
 * in order of first appearance, as self-delimiting UTF-8 strings (nonterms only in exact mode);</li>
 * <li><code><var>name</var>.docoffsets</code>, <code><var>name</var>.termoffsets</code>,
 * <code><var>name</var>.nontermoffsets</code>: delta-coded sizes of the records of the corresponding files;</li>
 * <li><code><var>name</var>.zip</code>: one entry per document holding non-text fields (only if the factory
 * declares at least one non-text field);</li>
 * <li><code><var>name</var>.collection</code>: the serialised collection object;</li>
 * <li><code><var>name</var>.stats</code>: human-readable size statistics.</li>
 * </ul>
 *
 * <p>The expected call sequence is {@link #open(CharSequence)}, then for each document
 * {@link #startDocument(CharSequence, CharSequence)}, the field callbacks, {@link #endDocument()},
 * and finally {@link #close()}. {@link #build(DocumentSequence)} drives that sequence for a whole
 * {@link DocumentSequence}.
 */
public class SimpleCompressedDocumentCollectionBuilder
implements DocumentCollectionBuilder {
    /** Size hint handed to the term/nonterm maps on construction and when they are trimmed in {@link #open(CharSequence)}. */
    private static final int DICTIONARY_SIZE_HINT = 1000;

    /** The factory describing the fields of the documents being stored. */
    private final DocumentFactory factory;
    /** Whether nonwords are stored alongside words. */
    private final boolean exact;
    /** Codec applied to term indices before they are written to the documents stream. */
    private final SimpleCompressedDocumentCollection.FrequencyCodec termsFrequencyKeeper;
    /** Codec applied to nonterm indices; <code>null</code> unless {@link #exact}. */
    private final SimpleCompressedDocumentCollection.FrequencyCodec nonTermsFrequencyKeeper;
    /** The basename supplied at construction time. */
    private final String basename;
    /** {@link #basename} concatenated with the suffix of the last {@link #open(CharSequence)} call. */
    private String basenameSuffix;
    private OutputBitStream documentsOutputBitStream;
    private CountingOutputStream termsOutputStream;
    /** <code>null</code> unless {@link #exact}. */
    private CountingOutputStream nonTermsOutputStream;
    private OutputBitStream documentOffsetsObs;
    private OutputBitStream termOffsetsObs;
    /** <code>null</code> unless {@link #exact}. */
    private OutputBitStream nonTermOffsetsObs;
    /** Indices accumulated for the current text field: terms only, or term/nonterm pairs if {@link #exact}. */
    private IntArrayList fieldContent;
    /** Maps each word seen so far to its index (order of first appearance); returns -1 for unknown words. */
    private final Object2IntOpenHashMap<MutableString> terms;
    /** As {@link #terms}, for nonwords; <code>null</code> unless {@link #exact}. */
    private final Object2IntOpenHashMap<MutableString> nonTerms;
    /** Number of documents started since the last {@link #open(CharSequence)}. */
    private int documents;
    private long words;
    private long fields;
    private long bitsForWords;
    private long bitsForNonWords;
    private long bitsForFieldLengths;
    private long bitsForUris;
    private long bitsForTitles;
    /** Whether the factory declares at least one field whose type is not {@link DocumentFactory.FieldType#TEXT}. */
    private final boolean hasNonText;
    /** Zip archive receiving non-text fields; only created when {@link #hasNonText}. */
    private ZipOutputStream nonTextZipOutputStream;
    /** Data view over {@link #nonTextZipOutputStream}. */
    private DataOutputStream nonTextZipDataOutputStream;

    /**
     * Creates a builder.
     *
     * @param basename the basename of the files that will be written.
     * @param factory the factory of the documents that will be added.
     * @param exact if true, nonwords are recorded as well as words.
     */
    public SimpleCompressedDocumentCollectionBuilder(String basename, DocumentFactory factory, boolean exact) {
        this.basename = basename;
        this.factory = factory;
        this.exact = exact;
        this.termsFrequencyKeeper = new SimpleCompressedDocumentCollection.FrequencyCodec();
        this.nonTermsFrequencyKeeper = exact ? new SimpleCompressedDocumentCollection.FrequencyCodec() : null;

        boolean anyNonText = false;
        for (int field = factory.numberOfFields() - 1; field >= 0; --field) {
            anyNonText |= factory.fieldType(field) != DocumentFactory.FieldType.TEXT;
        }
        this.hasNonText = anyNonText;

        this.terms = new Object2IntOpenHashMap<MutableString>(DICTIONARY_SIZE_HINT);
        this.terms.defaultReturnValue(-1);
        if (exact) {
            this.nonTerms = new Object2IntOpenHashMap<MutableString>(DICTIONARY_SIZE_HINT);
            this.nonTerms.defaultReturnValue(-1);
        } else {
            this.nonTerms = null;
        }
    }

    /**
     * Returns the basename supplied at construction time.
     *
     * @return the basename (without any suffix).
     */
    @Override
    public String basename() {
        return this.basename;
    }

    /**
     * Creates the output files for the collection named <code>basename + suffix</code> and resets
     * the dictionaries and all counters, so that the builder can be reused for a new collection.
     *
     * @param suffix a suffix appended to the basename to obtain the name of the collection.
     * @throws IOException if one of the output files cannot be created or written.
     */
    @Override
    public void open(CharSequence suffix) throws IOException {
        this.basenameSuffix = this.basename + suffix;
        final String name = this.basenameSuffix;

        this.documentsOutputBitStream = new OutputBitStream(name + ".documents");
        this.termsOutputStream = new CountingOutputStream(new FastBufferedOutputStream(new FileOutputStream(name + ".terms")));
        this.nonTermsOutputStream = this.exact
                ? new CountingOutputStream(new FastBufferedOutputStream(new FileOutputStream(name + ".nonterms")))
                : null;
        this.documentOffsetsObs = new OutputBitStream(name + ".docoffsets");
        this.termOffsetsObs = new OutputBitStream(name + ".termoffsets");
        this.nonTermOffsetsObs = this.exact ? new OutputBitStream(name + ".nontermoffsets") : null;
        this.fieldContent = new IntArrayList();
        if (this.hasNonText) {
            this.nonTextZipOutputStream = new ZipOutputStream(new FastBufferedOutputStream(new FileOutputStream(name + ".zip")));
            this.nonTextZipDataOutputStream = new DataOutputStream(this.nonTextZipOutputStream);
        }

        this.terms.clear();
        this.terms.trim(DICTIONARY_SIZE_HINT);
        if (this.exact) {
            this.nonTerms.clear();
            this.nonTerms.trim(DICTIONARY_SIZE_HINT);
        }

        this.documents = 0;
        this.words = 0;
        this.fields = 0;
        this.bitsForWords = 0;
        this.bitsForNonWords = 0;
        this.bitsForFieldLengths = 0;
        this.bitsForUris = 0;
        this.bitsForTitles = 0;

        // Every offsets stream starts with a zero.
        this.documentOffsetsObs.writeDelta(0);
        this.termOffsetsObs.writeDelta(0);
        if (this.exact) {
            this.nonTermOffsetsObs.writeDelta(0);
        }
    }

    /**
     * Appends a word (and, in exact mode, the nonword following it) to the current text field.
     * Strings never seen before are added to the relevant dictionary file.
     *
     * @param word the word to add.
     * @param nonWord the nonword following it; ignored unless the builder is exact.
     * @throws IOException if a dictionary file cannot be written.
     */
    @Override
    public void add(MutableString word, MutableString nonWord) throws IOException {
        this.fieldContent.add(indexOf(this.terms, word, this.termsOutputStream, this.termOffsetsObs));
        if (this.exact) {
            this.fieldContent.add(indexOf(this.nonTerms, nonWord, this.nonTermsOutputStream, this.nonTermOffsetsObs));
        }
    }

    /**
     * Returns the index of a string in a dictionary, registering the string first if it is new.
     * A new string gets the next free index, is appended to <code>out</code> in self-delimiting
     * UTF-8 form, and the number of bytes so written is appended to <code>offsets</code>.
     */
    private static int indexOf(Object2IntOpenHashMap<MutableString> dictionary, MutableString s, CountingOutputStream out, OutputBitStream offsets) throws IOException {
        int index = dictionary.getInt(s);
        if (index == -1) {
            index = dictionary.size();
            // The caller may reuse the mutable string, so the map must keep its own copy.
            dictionary.put(s.copy(), index);
            out.resetByteCount();
            s.writeSelfDelimUTF8(out);
            offsets.writeLongDelta(out.getByteCount());
        }
        return index;
    }

    /**
     * Closes all output files, then writes the <code>.collection</code> and <code>.stats</code> files.
     *
     * @throws IOException if closing or writing any of the files fails.
     */
    @Override
    public void close() throws IOException {
        this.documentsOutputBitStream.close();
        this.termsOutputStream.close();
        // Closed explicitly (not quietly): a failed final flush would leave a truncated
        // .nonterms file and must be reported like failures on every other stream.
        if (this.nonTermsOutputStream != null) {
            this.nonTermsOutputStream.close();
        }
        this.documentOffsetsObs.close();
        this.termOffsetsObs.close();
        if (this.nonTermOffsetsObs != null) {
            this.nonTermOffsetsObs.close();
        }
        if (this.hasNonText) {
            if (this.documents == 0) {
                // NOTE(review): presumably needed because ZipOutputStream may refuse to
                // finish an archive that contains no entries; confirm against the target JDK.
                this.nonTextZipOutputStream.putNextEntry(new ZipEntry("dummy"));
            }
            this.nonTextZipDataOutputStream.close();
        }

        final SimpleCompressedDocumentCollection collection = new SimpleCompressedDocumentCollection(this.basenameSuffix, this.documents, this.terms.size(), this.nonTerms != null ? (long)this.nonTerms.size() : -1L, this.exact, this.factory);
        try {
            BinIO.storeObject(collection, this.basenameSuffix + ".collection");
        } finally {
            collection.close();
        }

        writeStatistics();
    }

    /** Writes the human-readable size statistics to <code><var>name</var>.stats</code>. */
    private void writeStatistics() throws IOException {
        final PrintStream stats = new PrintStream(new FileOutputStream(this.basenameSuffix + ".stats"));
        try {
            final long overallBits = this.bitsForTitles + this.bitsForUris + this.bitsForFieldLengths + this.bitsForWords + this.bitsForNonWords;
            stats.println("Documents: " + Util.format((long)this.documents) + " (" + Util.format(overallBits) + ", " + average(overallBits, this.documents) + " bits per document)");
            stats.println("Terms: " + Util.format((long)this.terms.size()) + " (" + Util.format(this.words) + " words, " + Util.format(this.bitsForWords) + " bits, " + average(this.bitsForWords, this.words) + " bits per word)");
            if (this.exact) {
                stats.println("Nonterms: " + Util.format((long)this.nonTerms.size()) + " (" + Util.format(this.words) + " nonwords, " + Util.format(this.bitsForNonWords) + " bits, " + average(this.bitsForNonWords, this.words) + " bits per nonword)");
            }
            stats.println("Bits for field lengths: " + Util.format(this.bitsForFieldLengths) + " (" + average(this.bitsForFieldLengths, this.fields) + " bits per field)");
            stats.println("Bits for URIs: " + Util.format(this.bitsForUris) + " (" + average(this.bitsForUris, this.documents) + " bits per URI)");
            stats.println("Bits for titles: " + Util.format(this.bitsForTitles) + " (" + average(this.bitsForTitles, this.documents) + " bits per title)");
        } finally {
            stats.close();
        }
    }

    /** Formats <code>bits / units</code>, computed in floating point (so a zero divisor does not throw). */
    private static String average(long bits, long units) {
        return Util.format((double)bits / (double)units);
    }

    /**
     * Terminates the current document: records the number of bits written for it in the
     * document-offsets stream and, if present, closes its entry in the zip archive.
     *
     * @throws IOException if writing fails.
     */
    @Override
    public void endDocument() throws IOException {
        // writtenBits was reset in startDocument(), so this is the size of the current record.
        this.documentOffsetsObs.writeLongDelta(this.documentsOutputBitStream.writtenBits());
        if (this.hasNonText) {
            this.nonTextZipOutputStream.closeEntry();
        }
    }

    /**
     * Terminates the current text field, writing its length (in words) followed by the
     * codec output for each accumulated index to the documents stream.
     *
     * @throws IOException if writing fails.
     */
    @Override
    public void endTextField() throws IOException {
        final int size = this.fieldContent.size();
        // In exact mode fieldContent holds (term, nonterm) pairs.
        final int occurrences = this.exact ? size / 2 : size;
        this.words += occurrences;
        this.bitsForFieldLengths += this.documentsOutputBitStream.writeDelta(occurrences);

        this.termsFrequencyKeeper.reset();
        if (this.exact) {
            this.nonTermsFrequencyKeeper.reset();
            for (int i = 0; i < size; i += 2) {
                this.bitsForWords += this.documentsOutputBitStream.writeDelta(this.termsFrequencyKeeper.encode(this.fieldContent.getInt(i)));
                this.bitsForNonWords += this.documentsOutputBitStream.writeDelta(this.nonTermsFrequencyKeeper.encode(this.fieldContent.getInt(i + 1)));
            }
        } else {
            for (int i = 0; i < size; ++i) {
                this.bitsForWords += this.documentsOutputBitStream.writeDelta(this.termsFrequencyKeeper.encode(this.fieldContent.getInt(i)));
            }
        }
    }

    /**
     * Serialises a non-text field into the zip entry of the current document.
     *
     * @param o the field content, written with Java serialisation.
     * @throws IOException if serialisation fails.
     */
    @Override
    public void nonTextField(Object o) throws IOException {
        final ObjectOutputStream oos = new ObjectOutputStream(this.nonTextZipDataOutputStream);
        oos.writeObject(o);
        // Flushed but deliberately not closed: closing would close the underlying zip stream.
        oos.flush();
    }

    /**
     * Writes a character sequence in self-delimiting form: its length in delta coding followed
     * by each of its <code>char</code>s in zeta coding with parameter 7. (Despite the method
     * name, the characters are not UTF-8 encoded.)
     *
     * @param obs the bit stream to write to.
     * @param s the character sequence to write.
     * @return the number of bits written.
     * @throws IOException if writing fails.
     */
    public static int writeSelfDelimitedUtf8String(OutputBitStream obs, CharSequence s) throws IOException {
        final int length = s.length();
        int bits = obs.writeDelta(length);
        for (int i = 0; i < length; ++i) {
            bits += obs.writeZeta((int)s.charAt(i), 7);
        }
        return bits;
    }

    /**
     * Starts a new document, writing its URI and title to the documents stream and, if the
     * factory has non-text fields, opening a zip entry named after the document index.
     *
     * @param title the title of the document; <code>null</code> is stored as the empty string.
     * @param uri the URI of the document; <code>null</code> is stored as the empty string.
     * @throws IOException if writing fails.
     */
    @Override
    public void startDocument(CharSequence title, CharSequence uri) throws IOException {
        // Reset the bit counter so that endDocument() can record this document's size.
        this.documentsOutputBitStream.writtenBits(0L);
        final CharSequence safeUri = uri == null ? "" : uri;
        final CharSequence safeTitle = title == null ? "" : title;
        this.bitsForUris += writeSelfDelimitedUtf8String(this.documentsOutputBitStream, safeUri);
        this.bitsForTitles += writeSelfDelimitedUtf8String(this.documentsOutputBitStream, safeTitle);
        if (this.hasNonText) {
            this.nonTextZipOutputStream.putNextEntry(new ZipEntry(Integer.toString(this.documents)));
        }
        ++this.documents;
    }

    /** Starts a new text field, discarding any indices left over from the previous one. */
    @Override
    public void startTextField() {
        this.fieldContent.size(0);
        ++this.fields;
    }

    /**
     * Writes a virtual field into the zip entry of the current document: the number of
     * fragments, then the document specifier and the text of each fragment as
     * self-delimiting UTF-8 strings.
     *
     * @param fragments the fragments making up the virtual field.
     * @throws IOException if writing fails.
     */
    @Override
    public void virtualField(ObjectList<Scan.VirtualDocumentFragment> fragments) throws IOException {
        this.nonTextZipDataOutputStream.writeInt(fragments.size());
        for (Scan.VirtualDocumentFragment fragment : fragments) {
            fragment.documentSpecifier().writeSelfDelimUTF8(this.nonTextZipOutputStream);
            fragment.text().writeSelfDelimUTF8(this.nonTextZipOutputStream);
        }
    }

    /**
     * Builds a collection (with empty suffix) from all documents of a sequence.
     *
     * <p>The iterator over the sequence, and each document it returns, are closed even if
     * processing fails. In that case {@link #close()} is <em>not</em> invoked, so no
     * <code>.collection</code> file is written for a partial build.
     *
     * @param inputSequence the sequence whose documents will be stored; its factory must be
     * the very instance passed to the constructor.
     * @throws IllegalStateException if the factory of <code>inputSequence</code> is not the
     * factory passed to the constructor.
     * @throws IOException if reading the sequence or writing the collection fails.
     */
    public void build(DocumentSequence inputSequence) throws IOException {
        // Validate before acquiring the iterator, so that nothing is left open on failure.
        if (this.factory != inputSequence.factory()) {
            throw new IllegalStateException("The factory provided by the constructor does not correspond to the factory of the input sequence");
        }

        final DocumentIterator docIt = inputSequence.iterator();
        try {
            final int numberOfFields = this.factory.numberOfFields();
            final MutableString word = new MutableString();
            final MutableString nonWord = new MutableString();

            this.open("");
            Document document;
            while ((document = docIt.nextDocument()) != null) {
                try {
                    this.startDocument(document.title(), document.uri());
                    for (int field = 0; field < numberOfFields; ++field) {
                        final Object content = document.content(field);
                        final DocumentFactory.FieldType type = this.factory.fieldType(field);
                        if (type == DocumentFactory.FieldType.TEXT) {
                            this.startTextField();
                            final WordReader wordReader = document.wordReader(field);
                            wordReader.setReader((Reader)content);
                            while (wordReader.next(word, nonWord)) {
                                this.add(word, nonWord);
                            }
                            this.endTextField();
                        } else if (type == DocumentFactory.FieldType.VIRTUAL) {
                            @SuppressWarnings("unchecked")
                            final ObjectList<Scan.VirtualDocumentFragment> fragments = (ObjectList<Scan.VirtualDocumentFragment>)content;
                            this.virtualField(fragments);
                        } else {
                            this.nonTextField(content);
                        }
                    }
                } finally {
                    document.close();
                }
                this.endDocument();
            }
        } finally {
            docIt.close();
        }
        this.close();
    }
}

