/*
 * Decompiled with CFR 0.152.
 */
package edu.umd.cloud9.collection.trecweb;

import edu.umd.cloud9.collection.WebDocument;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableUtils;

/**
 * A document delimited by {@code <DOC>} ... {@code </DOC>} markers that carries
 * a {@code <DOCNO>} identifier and a {@code <DOCHDR>} header whose first token
 * is taken as the document URL.
 *
 * <p>Instances are serialized with {@link #write(DataOutput)} and restored with
 * {@link #readFields(DataInput)}. NOTE: only the docid and the content are part
 * of the serialized form; the URL is not written and is therefore not restored.
 *
 * <p>The static parsing entry point {@link #readNextTrecWebDocument} accumulates
 * bytes in a single static buffer with no synchronization, so it must not be
 * called from several threads at once.
 */
public class TrecWebDocument extends WebDocument {
    public static final String XML_START_TAG = "<DOC>";
    public static final String XML_END_TAG = "</DOC>";

    /** Byte form of the document delimiters, used when scanning the raw stream. */
    private static final byte[] START_TAG_BYTES = XML_START_TAG.getBytes(StandardCharsets.UTF_8);
    private static final byte[] END_TAG_BYTES = XML_END_TAG.getBytes(StandardCharsets.UTF_8);

    /** Scratch buffer holding the bytes of the document currently being read. */
    private static final DataOutputBuffer buffer = new DataOutputBuffer();

    private String docid;
    private String content;
    private String url;

    /** Creates an empty document; fields are populated by parsing or by {@link #readFields}. */
    public TrecWebDocument() {
    }

    /**
     * Serializes this document: the docid as a modified-UTF string, followed by
     * the UTF-8 content prefixed with its byte length as a variable-length int.
     * The URL is not written.
     *
     * @param out sink to write to
     * @throws IOException if the underlying output fails
     */
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.docid);
        byte[] bytes = this.content.getBytes(StandardCharsets.UTF_8);
        WritableUtils.writeVInt(out, bytes.length);
        out.write(bytes, 0, bytes.length);
    }

    /**
     * Restores the docid and content previously produced by {@link #write}.
     * The URL field is left untouched because it is not part of the format.
     *
     * @param in source to read from
     * @throws IOException if the underlying input fails or ends prematurely
     */
    public void readFields(DataInput in) throws IOException {
        this.docid = in.readUTF();
        int length = WritableUtils.readVInt(in);
        byte[] bytes = new byte[length];
        in.readFully(bytes, 0, length);
        this.content = new String(bytes, StandardCharsets.UTF_8);
    }

    /** Returns the identifier found between the {@code <DOCNO>} tags. */
    public String getDocid() {
        return this.docid;
    }

    /** Returns the text between {@code </DOCHDR>} and the closing {@code </DOC>}. */
    public String getContent() {
        return this.content;
    }

    /** Returns the URL taken from the document header, or null if none has been parsed. */
    public String getURL() {
        return this.url;
    }

    /**
     * Parses the text of one complete document into {@code doc}.
     *
     * <p>The string is expected to end with the closing {@code </DOC>} tag: the
     * content is everything after {@code </DOCHDR>} minus the final six
     * characters. The URL is the first whitespace-delimited token inside the
     * {@code <DOCHDR>} section. {@code doc} is only modified once the whole
     * string has been parsed successfully.
     *
     * @param doc document to populate
     * @param s   raw document text, from {@code <DOC>} through {@code </DOC>}
     * @throws RuntimeException if {@code s} is null or a required tag is missing
     */
    public static void readDocument(TrecWebDocument doc, String s) {
        if (s == null) {
            throw new RuntimeException("Error, can't read null string!");
        }

        int docnoStart = s.indexOf("<DOCNO>");
        if (docnoStart == -1) {
            throw new RuntimeException("Unable to find DOCNO tag!");
        }
        int docnoEnd = s.indexOf("</DOCNO>", docnoStart);
        if (docnoEnd == -1) {
            throw new RuntimeException("Unable to find closing DOCNO tag!");
        }
        String parsedDocid = s.substring(docnoStart + "<DOCNO>".length(), docnoEnd);

        int hdrStart = s.indexOf("<DOCHDR>");
        if (hdrStart == -1) {
            throw new RuntimeException("Unable to find DOCHDR tag!");
        }
        int hdrEnd = s.indexOf("</DOCHDR>", hdrStart);
        if (hdrEnd == -1) {
            throw new RuntimeException("Unable to find DOCHDR tag!");
        }

        // The URL is the first token of the header. Skip whatever whitespace
        // follows the opening tag, then stop at the next whitespace character
        // (or at the closing tag) so that neither a missing line break nor a
        // newline-terminated URL corrupts the result.
        int urlStart = hdrStart + "<DOCHDR>".length();
        while (urlStart < hdrEnd && Character.isWhitespace(s.charAt(urlStart))) {
            urlStart++;
        }
        int urlEnd = urlStart;
        while (urlEnd < hdrEnd && !Character.isWhitespace(s.charAt(urlEnd))) {
            urlEnd++;
        }
        String parsedUrl = s.substring(urlStart, urlEnd);

        // Content runs from just past </DOCHDR> up to the trailing </DOC>.
        int contentStart = hdrEnd + "</DOCHDR>".length();
        int contentEnd = s.length() - XML_END_TAG.length();
        if (contentEnd < contentStart) {
            throw new RuntimeException("Document is truncated, no room for closing DOC tag!");
        }
        String parsedContent = s.substring(contentStart, contentEnd);

        doc.docid = parsedDocid;
        doc.url = parsedUrl;
        doc.content = parsedContent;
    }

    /**
     * Reads the next {@code <DOC>} ... {@code </DOC>} section from {@code stream}
     * and parses it into {@code doc}.
     *
     * <p>Bytes before the opening tag are discarded. The captured bytes are
     * decoded as UTF-8, matching the encoding used for the delimiter tags.
     *
     * @param doc    document to populate
     * @param stream stream positioned somewhere before the next document
     * @return true if a complete document was read and parsed; false if the
     *         stream ended before a complete document was found
     * @throws IOException      if reading from the stream fails
     * @throws RuntimeException if the captured text cannot be parsed
     */
    public static boolean readNextTrecWebDocument(TrecWebDocument doc, DataInputStream stream) throws IOException {
        if (!readUntilMatch(stream, START_TAG_BYTES, false)) {
            return false;
        }
        try {
            buffer.write(START_TAG_BYTES);
            if (!readUntilMatch(stream, END_TAG_BYTES, true)) {
                return false;
            }
            // getData() exposes the whole backing array, of which only the
            // first getLength() bytes belong to the current document.
            String s = new String(buffer.getData(), 0, buffer.getLength(), StandardCharsets.UTF_8);
            readDocument(doc, s);
            return true;
        } finally {
            // Always clear the shared buffer so the next call starts empty.
            buffer.reset();
        }
    }

    /**
     * Consumes bytes from {@code in} until {@code match} has been seen or the
     * stream ends.
     *
     * @param in          stream to consume
     * @param match       byte sequence to look for
     * @param withinBlock when true, every consumed byte is appended to the
     *                    shared buffer (including the matched bytes)
     * @return true if {@code match} was found, false on end of stream
     * @throws IOException if reading from the stream fails
     */
    private static boolean readUntilMatch(DataInputStream in, byte[] match, boolean withinBlock) throws IOException {
        int matched = 0;
        int b;
        while ((b = in.read()) != -1) {
            if (withinBlock) {
                buffer.write(b);
            }
            if (b == match[matched]) {
                if (++matched == match.length) {
                    return true;
                }
            } else {
                // A byte that breaks a partial match may itself begin a new
                // one (e.g. the second '<' in "<<DOC>"). Restarting at the
                // first pattern byte is exact for the tags used here, whose
                // first byte does not recur later in the pattern.
                matched = (b == match[0]) ? 1 : 0;
            }
        }
        return false;
    }
}

