/*
 * Decompiled with CFR 0.152.
 */
package com.nexwave.nquindexer;

import com.nexwave.nquindexer.SaxDocFileParser;
import com.nexwave.nsidita.DocFileInfo;
import com.nexwave.stemmer.snowball.SnowballStemmer;
import com.nexwave.stemmer.snowball.ext.EnglishStemmer;
import com.nexwave.stemmer.snowball.ext.FrenchStemmer;
import com.nexwave.stemmer.snowball.ext.GermanStemmer;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.AbstractCollection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class SaxHTMLIndex
extends SaxDocFileParser {
    private Map<String, String> tempDico;
    private int i = 0;
    private ArrayList<String> cleanUpList = null;
    private ArrayList<String> cleanUpPunctuation = null;

    public SaxHTMLIndex() {
    }

    public SaxHTMLIndex(ArrayList<String> arrayList) {
        this.cleanUpList = arrayList;
    }

    public SaxHTMLIndex(ArrayList<String> arrayList, ArrayList<String> arrayList2) {
        this.cleanUpList = arrayList;
        this.cleanUpPunctuation = arrayList2;
    }

    public int init(Map<String, String> map) {
        this.tempDico = map;
        return 0;
    }

    public DocFileInfo runExtractData(File file, String string) {
        String[] stringArray;
        Object object;
        Object object2;
        Object object3;
        Object object4;
        this.fileDesc = new DocFileInfo(file);
        this.strbf = new StringBuffer("");
        this.parseDocument(file);
        String string2 = this.cleanBuffer(this.strbf);
        string2 = string2.replaceAll("\\s+", " ");
        String[] stringArray2 = string2.split("\\s");
        if (string.equalsIgnoreCase("ja") || string.equalsIgnoreCase("zh") || string.equalsIgnoreCase("ko")) {
            object4 = new LinkedList();
            try {
                object3 = new CJKAnalyzer(Version.LUCENE_30);
                object2 = new StringReader(string2);
                object = object3.tokenStream("", (Reader)object2);
                TermAttribute termAttribute = (TermAttribute)object.addAttribute(TermAttribute.class);
                OffsetAttribute offsetAttribute = (OffsetAttribute)object.addAttribute(OffsetAttribute.class);
                while (object.incrementToken()) {
                    String string3 = termAttribute.term();
                    ((LinkedList)object4).add(string3);
                }
                stringArray = ((LinkedList)object4).toArray(new String[((LinkedList)object4).size()]);
            }
            catch (IOException iOException) {
                stringArray = stringArray2;
                System.out.println("Error tokenizing content using CJK Analyzer. IOException");
                iOException.printStackTrace();
            }
        } else {
            object4 = string.equalsIgnoreCase("en") ? new EnglishStemmer() : (string.equalsIgnoreCase("de") ? new GermanStemmer() : (string.equalsIgnoreCase("fr") ? new FrenchStemmer() : null));
            stringArray = object4 != null ? ((SnowballStemmer)object4).doStem(stringArray2) : stringArray2;
        }
        object4 = new HashSet();
        ((AbstractCollection)object4).addAll(Arrays.asList(stringArray));
        object3 = ((HashSet)object4).iterator();
        while (object3.hasNext()) {
            object2 = (String)object3.next();
            if (this.tempDico.containsKey(object2)) {
                object = this.tempDico.get(object2);
                object = ((String)object).concat(",").concat(Integer.toString(this.i));
                this.tempDico.put((String)object2, (String)object);
                continue;
            }
            this.tempDico.put((String)object2, Integer.toString(this.i));
        }
        ++this.i;
        return this.fileDesc;
    }

    private String cleanBuffer(StringBuffer stringBuffer) {
        Iterator<String> iterator;
        String string = stringBuffer.toString().toLowerCase();
        StringBuffer stringBuffer2 = new StringBuffer("");
        StringBuffer stringBuffer3 = new StringBuffer("");
        if (this.cleanUpList == null || this.cleanUpList.isEmpty()) {
            stringBuffer2.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");
            stringBuffer2.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");
            stringBuffer2.append("|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b");
            stringBuffer2.append("|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b");
            stringBuffer2.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");
            stringBuffer2.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");
            stringBuffer2.append("|\\bI\\b|\\bme\\b|\\bmy\\b");
            string = string.replaceFirst("Copyright \u00d4\u00f8\u03a9 1998-2007 NexWave Solutions.", " ");
        } else {
            stringBuffer2.append("\\ba\\b");
            iterator = this.cleanUpList.iterator();
            while (iterator.hasNext()) {
                stringBuffer2.append("|\\b" + iterator.next() + "\\b");
            }
        }
        if (this.cleanUpPunctuation != null && !this.cleanUpPunctuation.isEmpty()) {
            stringBuffer3.append("\\u3002");
            iterator = this.cleanUpPunctuation.iterator();
            while (iterator.hasNext()) {
                stringBuffer3.append("|" + iterator.next());
            }
        }
        string = this.minimalClean(string, stringBuffer2, stringBuffer3);
        return string;
    }

    private String minimalClean(String string, StringBuffer stringBuffer, StringBuffer stringBuffer2) {
        String string2 = new String(stringBuffer2);
        string = string.replaceAll("\\s+", " ");
        string = string.replaceAll("->", " ");
        string = string.replaceAll("[$|%,;.':()\\/*\"{}=!&+<>#\\?]|\\[|\\]|[-][-]+", " ");
        string = string.replaceAll("[$,;.':()\\/*\"{}=!&+<>\\\\]", " ");
        string = string.replaceAll("\\u3000|\\u3001|\\u3002|\\u3003|\\u3008|\\u3009|\\u300C|\\u300D", " ");
        string = string.replaceAll("\\u3013|\\u3014|\\u3015|\\u301C|\\u301D|\\u301E|\\u301F", " ");
        string = string.replaceAll("\\u3013|\\u300C|\\u300D", " ");
        if (string2.length() > 0) {
            string = string.replaceAll(string2, " ");
        }
        string = string.replaceAll(stringBuffer.toString(), " ");
        string = string.replaceAll("[$|%,;.':()\\/*\"{}=!&+<>#\\?]|\\[|\\]|[-][-]+", " ");
        string = string.replaceAll("[$,;.':()\\/*\"{}=!&+<>\\\\]", " ");
        string = string.replaceAll("\\u3000|\\u3001|\\u3002|\\u3003|\\u3008|\\u3009|\\u300C|\\u300D", " ");
        string = string.replaceAll("\\u3013|\\u3014|\\u3015|\\u301C|\\u301D|\\u301E|\\u301F", " ");
        string = string.replaceAll("\\u3013|\\u300C|\\u300D", " ");
        if (string2.length() > 0) {
            string = string.replaceAll(string2, " ");
        }
        return string;
    }
}

