/*
 * Decompiled with CFR 0.152.
 */
package it.unimi.dsi.mg4j.document;

import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.StringParser;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.bytes.ByteArrays;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastByteArrayInputStream;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongIterable;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.MultipleInputStream;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.mg4j.document.AbstractDocumentCollection;
import it.unimi.dsi.mg4j.document.AbstractDocumentIterator;
import it.unimi.dsi.mg4j.document.Document;
import it.unimi.dsi.mg4j.document.DocumentFactory;
import it.unimi.dsi.mg4j.document.DocumentIterator;
import it.unimi.dsi.mg4j.document.IdentityDocumentFactory;
import it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.dsi.mg4j.document.ReplicatedDocumentFactory;
import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;
import org.apache.log4j.Logger;

/**
 * A document collection backed by one or more text files made of lines.
 *
 * <p>Lines starting with <code>%%#</code> are meta lines: <code>%%#DOC</code>,
 * <code>%%#PAGE</code> (followed by the page title) and <code>%%#SEN</code>.
 * Every other line is split on tabs into at most {@value #NUM_FIELDS} fields
 * (see {@link #FIELD_NAME}); the content of the <var>k</var>-th column of all
 * lines of a document is accumulated into the <var>k</var>-th field buffer.
 *
 * <p>At construction time each file is scanned once and the byte offset of every
 * document start is recorded. A document starts at each <code>%%#DOC</code> line;
 * in <em>phrase</em> mode a document additionally starts at each
 * <code>%%#SEN</code> line.
 *
 * <p>Instances keep a single set of reusable buffers: the streams and metadata
 * returned for a document are only valid until another document is read from the
 * same instance. Use {@link #copy()} to obtain an instance with its own buffers.
 */
public class WikipediaDocumentCollection
extends AbstractDocumentCollection
implements Serializable {
    private static final Logger LOGGER = Util.getLogger(WikipediaDocumentCollection.class);
    private static final long serialVersionUID = 1L;
    /** Prefix shared by all meta lines. */
    private static final byte[] META_MARKER = "%%#".getBytes();
    /** Prefix of the line that starts a document. */
    private static final byte[] DOC_MARKER = "%%#DOC".getBytes();
    /** Prefix of the line carrying the page title. */
    private static final byte[] PAGE_MARKER = "%%#PAGE".getBytes();
    /** Prefix of the line that starts a sentence. */
    private static final byte[] SENTENCE_MARKER = "%%#SEN".getBytes();
    /** Bytes appended to every field at a page/sentence start: UTF-8 for U+00B6 (pilcrow) followed by a newline. */
    private static final byte[] PARAGRAPH_SEPARATOR = {(byte)0xC2, (byte)0xB6, (byte)'\n'};
    /** Number of tab-separated columns, and hence of fields, handled per line. */
    private static final int NUM_FIELDS = 10;
    /** Names of the fields, in column order. */
    private static final String[] FIELD_NAME = new String[]{"token", "POS", "lemma", "CONL", "WNSS", "WSJ", "ana", "head", "deplabel", "link"};
    /** The names of the files making up this collection. */
    private final String[] file;
    /** Whether the files are gzipped. */
    private final boolean gzipped;
    /** The factory used to build documents. */
    private final DocumentFactory factory;
    /** For each file, the start offsets of its documents followed by the file length. */
    private final ObjectArrayList<EliasFanoMonotoneLongBigList> pointers;
    /** The overall number of documents. */
    private final int size;
    /** Whether sentences, too, start a new document. */
    private final boolean phrase;
    /** <code>firstDocument[k]</code> is the index of the first document of file <var>k</var>; the last entry is {@link #size}. */
    private final int[] firstDocument;
    /** One reusable byte buffer per field. */
    private transient byte[][] buffer;
    /** Reusable buffer holding the last line read; bytes beyond the returned line length are stale. */
    private transient byte[] lineBuffer;
    /** Number of valid bytes in each element of {@link #buffer}. */
    private transient int[] bufferSize;
    /** Reusable metadata map for the last document read. */
    private transient Reference2ObjectMap<Enum<?>, Object> metadata;
    /** Index of the document currently held in the buffers, or -1 if none. */
    private transient int lastDocument;

    /** Allocates the transient, per-instance reading state. */
    private void initBuffers() {
        bufferSize = new int[NUM_FIELDS];
        buffer = new byte[NUM_FIELDS][];
        Arrays.fill(buffer, ByteArrays.EMPTY_ARRAY);
        lineBuffer = ByteArrays.EMPTY_ARRAY;
        lastDocument = -1;
        metadata = new Reference2ObjectArrayMap<Enum<?>, Object>();
    }

    /**
     * Builds a collection over non-gzipped files.
     *
     * @param file the names of the files to scan.
     * @param factory the factory used to build documents.
     * @param phrase whether <code>%%#SEN</code> lines, too, start a new document.
     * @throws IOException if a file cannot be opened or read.
     */
    public WikipediaDocumentCollection(String[] file, DocumentFactory factory, boolean phrase) throws IOException {
        this(file, factory, phrase, false);
    }

    /**
     * Builds a collection by scanning the given files and recording where each document starts.
     *
     * @param file the names of the files to scan.
     * @param factory the factory used to build documents.
     * @param phrase whether <code>%%#SEN</code> lines, too, start a new document.
     * @param gzipped whether the files are gzipped.
     * @throws IOException if a file cannot be opened or read.
     */
    public WikipediaDocumentCollection(String[] file, DocumentFactory factory, boolean phrase, boolean gzipped) throws IOException {
        this.file = file;
        this.factory = factory;
        this.gzipped = gzipped;
        this.phrase = phrase;
        initBuffers();
        final LongArrayList p = new LongArrayList();
        pointers = new ObjectArrayList<EliasFanoMonotoneLongBigList>(file.length);
        firstDocument = new int[file.length + 1];
        int count = 0;
        final ProgressLogger pl = new ProgressLogger(LOGGER);
        pl.expectedUpdates = file.length;
        pl.itemsName = "files";
        pl.start("Scanning files...");
        for (String f : file) {
            p.clear();
            final FastBufferedInputStream fbis = openDataFile(f);
            try {
                for (;;) {
                    final long position = fbis.position();
                    final int l = readLine(fbis);
                    if (l == -1) {
                        break;
                    }
                    if (startsWith(lineBuffer, l, DOC_MARKER) || (phrase && startsWith(lineBuffer, l, SENTENCE_MARKER))) {
                        p.add(position);
                    }
                }
                count += p.size();
                // The file length closes the last document of this file.
                p.add(fbis.position());
            }
            finally {
                fbis.close();
            }
            pointers.add(new EliasFanoMonotoneLongBigList((LongIterable)p));
            firstDocument[pointers.size()] = count;
            pl.update();
        }
        pl.done();
        size = count;
    }

    /**
     * Opens one of the files of this collection, decompressing it on the fly if the collection is gzipped.
     *
     * @param f the name of the file.
     * @return a buffered stream over the (uncompressed) content of the file.
     * @throws IOException if the file cannot be opened.
     */
    private FastBufferedInputStream openDataFile(String f) throws IOException {
        final InputStream raw = new FileInputStream(f);
        try {
            return new FastBufferedInputStream(gzipped ? new GZIPInputStream(raw) : raw);
        }
        catch (IOException e) {
            // The gzip header could not be read: do not leak the file descriptor.
            raw.close();
            throw e;
        }
    }

    /**
     * Moves a stream to a given position in the uncompressed data.
     *
     * <p>Plain files are repositioned directly. A gzipped stream cannot be
     * repositioned, so it is advanced by skipping; moving backwards is not possible.
     *
     * @param fbis the stream to move.
     * @param target the position to reach.
     * @throws IOException if the position cannot be reached.
     */
    private void seek(FastBufferedInputStream fbis, long target) throws IOException {
        if (!gzipped) {
            fbis.position(target);
            return;
        }
        long toSkip = target - fbis.position();
        if (toSkip < 0) {
            throw new IOException("Cannot move backwards to position " + target + " in a gzipped stream");
        }
        while (toSkip > 0) {
            final long skipped = fbis.skip(toSkip);
            if (skipped > 0) {
                toSkip -= skipped;
            }
            else if (fbis.read() == -1) {
                throw new IOException("Unexpected end of file while skipping to position " + target);
            }
            else {
                toSkip--;
            }
        }
    }

    /**
     * Reads the next line into {@link #lineBuffer}, enlarging the buffer as needed.
     *
     * @param fbis the stream to read from.
     * @return the length of the line (terminator excluded), or -1 if the stream
     * was already at its end.
     * @throws IOException if reading fails.
     */
    private int readLine(FastBufferedInputStream fbis) throws IOException {
        int start = 0;
        int len;
        // A read that fills all the remaining space may have stopped in the middle of a line.
        while ((len = fbis.readLine(lineBuffer, start, lineBuffer.length - start, FastBufferedInputStream.ALL_TERMINATORS)) == lineBuffer.length - start) {
            start += len;
            lineBuffer = ByteArrays.grow(lineBuffer, lineBuffer.length + 1);
        }
        if (len == -1) {
            // End of stream right after a chunk that exactly filled the buffer:
            // that chunk is a complete, unterminated last line and must not be dropped.
            return start == 0 ? -1 : start;
        }
        return start + len;
    }

    /**
     * Builds a collection from precomputed data; used by {@link #copy()}.
     *
     * @param file the names of the files.
     * @param factory the factory used to build documents.
     * @param pointers for each file, the document start offsets followed by the file length.
     * @param size the overall number of documents.
     * @param firstDocument for each file, the index of its first document, followed by <code>size</code>.
     * @param phrase whether sentences, too, start a new document.
     * @param gzipped whether the files are gzipped.
     */
    protected WikipediaDocumentCollection(String[] file, DocumentFactory factory, ObjectArrayList<EliasFanoMonotoneLongBigList> pointers, int size, int[] firstDocument, boolean phrase, boolean gzipped) {
        this.file = file;
        this.factory = factory;
        this.pointers = pointers;
        this.size = size;
        this.firstDocument = firstDocument;
        this.gzipped = gzipped;
        this.phrase = phrase;
        initBuffers();
    }

    /**
     * Checks whether the first <code>length</code> bytes of an array start with a pattern.
     *
     * <p>The explicit length matters because the line buffer is reused: bytes
     * beyond the current line belong to previously read lines.
     *
     * @param array the array holding the line.
     * @param length the number of valid bytes in <code>array</code> (may be -1 at end of stream).
     * @param pattern the prefix to look for.
     * @return true if the valid part of <code>array</code> starts with <code>pattern</code>.
     */
    private static boolean startsWith(byte[] array, int length, byte[] pattern) {
        if (length < pattern.length) {
            return false;
        }
        for (int i = pattern.length; i-- != 0;) {
            if (array[i] != pattern[i]) {
                return false;
            }
        }
        return true;
    }

    @Override
    public DocumentFactory factory() {
        return factory;
    }

    @Override
    public int size() {
        return size;
    }

    /**
     * Returns the metadata of a document. If the document carries no page title,
     * the title is set to <code>Sentence #</code> followed by the one-based index.
     *
     * <p>The returned map is reused by this instance and is cleared when another
     * document is read.
     */
    @Override
    public Reference2ObjectMap<Enum<?>, Object> metadata(int index) throws IOException {
        readDocument(index, -1, null);
        if (!metadata.containsKey(PropertyBasedDocumentFactory.MetadataKeys.TITLE)) {
            metadata.put(PropertyBasedDocumentFactory.MetadataKeys.TITLE, "Sentence #" + (index + 1));
        }
        return metadata;
    }

    @Override
    public Document document(int index) throws IOException {
        return factory.getDocument(stream(index), metadata(index));
    }

    /**
     * Returns a multiple stream with one component per field.
     *
     * <p>The components wrap the internal field buffers directly, so they are
     * only valid until another document is read from this instance.
     */
    @Override
    public InputStream stream(int index) throws IOException {
        readDocument(index, -1, null);
        final InputStream[] is = new InputStream[NUM_FIELDS];
        for (int i = 0; i < NUM_FIELDS; i++) {
            is[i] = new FastByteArrayInputStream(buffer[i], 0, bufferSize[i]);
        }
        return MultipleInputStream.getStream(is);
    }

    /**
     * Returns an iterator that reads the documents sequentially, keeping a single
     * stream open on the current file. Files are opened lazily, on the first call
     * to <code>nextDocument()</code> that needs them.
     */
    @Override
    public DocumentIterator iterator() throws IOException {
        return new AbstractDocumentIterator(){
            /** Index of the next document to return. */
            private int index = 0;
            /** Index of the file <code>fbis</code> is open on, or -1 before the first file is opened. */
            private int f = -1;
            /** Stream on the current file, or null if none is open. */
            private FastBufferedInputStream fbis;

            @Override
            public void close() throws IOException {
                super.close();
                if (fbis != null) {
                    fbis.close();
                    fbis = null;
                }
            }

            @Override
            public Document nextDocument() throws IOException {
                if (index == WikipediaDocumentCollection.this.size) {
                    return null;
                }
                // Move to the file containing the next document; a loop is needed
                // because files without documents must be skipped altogether.
                while (index == WikipediaDocumentCollection.this.firstDocument[f + 1]) {
                    if (fbis != null) {
                        fbis.close();
                        fbis = null;
                    }
                    fbis = WikipediaDocumentCollection.this.openDataFile(WikipediaDocumentCollection.this.file[++f]);
                }
                WikipediaDocumentCollection.this.readDocument(index, f, fbis);
                // The document is now cached, so this call does not touch the files again.
                return WikipediaDocumentCollection.this.document(index++);
            }
        };
    }

    /**
     * Loads a document into the field buffers and the metadata map, unless it is
     * the document they already hold.
     *
     * @param index the index of the document.
     * @param f the index of the file containing the document; ignored if <code>fbis</code> is null.
     * @param fbis a stream open on file <code>f</code>, or null to have this
     * method locate, open and close the file itself.
     * @throws IOException if the file cannot be opened, positioned or read.
     */
    private void readDocument(int index, int f, FastBufferedInputStream fbis) throws IOException {
        ensureDocumentIndex(index);
        if (index == lastDocument) {
            return;
        }
        final boolean openStream = fbis == null;
        if (openStream) {
            f = Arrays.binarySearch(firstDocument, index);
            if (f < 0) {
                f = -f - 2;
            }
            else {
                // Files without documents produce duplicates, and binary search may
                // land on any of them: the document lives in the last such file.
                while (f + 1 < firstDocument.length && firstDocument[f + 1] == index) {
                    f++;
                }
            }
            fbis = openDataFile(file[f]);
        }
        // The buffers are about to be overwritten: should reading fail, they hold no valid document.
        lastDocument = -1;
        try {
            final EliasFanoMonotoneLongBigList filePointers = pointers.get(f);
            final int localIndex = index - firstDocument[f];
            seek(fbis, filePointers.getLong(localIndex));
            final long end = filePointers.getLong(localIndex + 1);
            Arrays.fill(bufferSize, 0);
            metadata.clear();
            while (fbis.position() < end) {
                final int l = readLine(fbis);
                if (l == -1) {
                    // The file is shorter than when it was scanned; stop instead of spinning forever.
                    LOGGER.warn("Unexpected end of file " + file[f] + " while reading document " + index);
                    break;
                }
                if (!startsWith(lineBuffer, l, META_MARKER)) {
                    appendFields(l);
                    continue;
                }
                if (phrase && startsWith(lineBuffer, l, DOC_MARKER)) {
                    // In phrase mode a %%#DOC line is a document of its own, with no content.
                    break;
                }
                final boolean startOfPage = startsWith(lineBuffer, l, PAGE_MARKER);
                final boolean startOfSentence = !startOfPage && startsWith(lineBuffer, l, SENTENCE_MARKER);
                if (startOfPage) {
                    // The title is whatever follows the marker and one separator character.
                    final String title = new String(lineBuffer, Math.min(PAGE_MARKER.length + 1, l), Math.max(l - PAGE_MARKER.length - 1, 0), "UTF-8").trim();
                    metadata.put(PropertyBasedDocumentFactory.MetadataKeys.TITLE, title);
                    metadata.put(PropertyBasedDocumentFactory.MetadataKeys.URI, "http://en.wikipedia.org/wiki/" + URLEncoder.encode(title, "UTF-8"));
                }
                if ((startOfPage || startOfSentence) && !phrase) {
                    appendParagraphSeparator();
                }
            }
            lastDocument = index;
        }
        finally {
            if (openStream) {
                fbis.close();
            }
        }
    }

    /** Appends {@link #PARAGRAPH_SEPARATOR} to every field buffer. */
    private void appendParagraphSeparator() {
        for (int i = 0; i < NUM_FIELDS; i++) {
            buffer[i] = ByteArrays.grow(buffer[i], bufferSize[i] + PARAGRAPH_SEPARATOR.length);
            System.arraycopy(PARAGRAPH_SEPARATOR, 0, buffer[i], bufferSize[i], PARAGRAPH_SEPARATOR.length);
            bufferSize[i] += PARAGRAPH_SEPARATOR.length;
        }
    }

    /**
     * Distributes the tab-separated columns of the current line over the field
     * buffers, terminating each nonempty column with a space.
     *
     * @param l the length of the line held in {@link #lineBuffer}.
     */
    private void appendFields(int l) {
        int field = 0;
        for (int i = 0; i < l; i++) {
            final byte b = lineBuffer[i];
            if (b == '\t') {
                field++;
                continue;
            }
            // Make room for this byte and for a possible trailing space.
            buffer[field] = ByteArrays.grow(buffer[field], bufferSize[field] + 2);
            buffer[field][bufferSize[field]++] = b;
            if (i == l - 1 || lineBuffer[i + 1] == '\t') {
                buffer[field][bufferSize[field]++] = ' ';
            }
        }
    }

    /** Returns a collection sharing the (immutable) index data of this one, with its own buffers and a copy of the factory. */
    @Override
    public WikipediaDocumentCollection copy() {
        return new WikipediaDocumentCollection(file, factory.copy(), pointers, size, firstDocument, phrase, gzipped);
    }

    /** Restores the transient reading state after default deserialization. */
    private void readObject(ObjectInputStream s) throws IOException, ClassNotFoundException {
        s.defaultReadObject();
        initBuffers();
    }

    /**
     * Builds a collection from the files given on the command line (or, if none
     * is given, listed one per line on standard input) and serialises it.
     */
    public static void main(String[] arg) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
        final SimpleJSAP jsap = new SimpleJSAP(WikipediaDocumentCollection.class.getName(), "Saves a serialised document collection based on a set of files.", new Parameter[]{
            new Switch("sentence", 's', "sentence", "Index sentences rather than documents."),
            new Switch("gzipped", 'z', "gzipped", "The files are gzipped."),
            new UnflaggedOption("collection", (StringParser)JSAP.STRING_PARSER, true, "The filename for the serialised collection."),
            new UnflaggedOption("file", (StringParser)JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, true, "A list of files that will be indexed. If missing, a list of files will be read from standard input.")
        });
        final JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted()) {
            return;
        }
        final IdentityDocumentFactory factory = new IdentityDocumentFactory(new Reference2ObjectOpenHashMap<Enum<?>, Object>(
            new PropertyBasedDocumentFactory.MetadataKeys[]{PropertyBasedDocumentFactory.MetadataKeys.ENCODING, PropertyBasedDocumentFactory.MetadataKeys.WORDREADER},
            new Object[]{"UTF-8", WhitespaceWordReader.class.getName()}));
        String[] file = (String[])jsapResult.getObjectArray("file", new String[0]);
        if (file.length == 0) {
            final ObjectArrayList<String> files = new ObjectArrayList<String>();
            // Standard input is deliberately left open.
            final BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(System.in));
            String s;
            while ((s = bufferedReader.readLine()) != null) {
                files.add(s);
            }
            file = files.toArray(new String[0]);
        }
        if (file.length == 0) {
            System.err.println("WARNING: empty file set.");
        }
        BinIO.storeObject(new WikipediaDocumentCollection(file, ReplicatedDocumentFactory.getFactory(factory, NUM_FIELDS, FIELD_NAME), jsapResult.getBoolean("sentence"), jsapResult.getBoolean("gzipped")), jsapResult.getString("collection"));
    }

    /** A word reader for which every character that is not whitespace is part of a word. */
    public static class WhitespaceWordReader
    extends FastBufferedReader {
        private static final long serialVersionUID = 1L;

        protected boolean isWordConstituent(char c) {
            return !Character.isWhitespace(c);
        }
    }
}

