/*
 * Decompiled with CFR 0.152.
 */
package com.hankcs.hanlp.mining.word;

import com.hankcs.hanlp.algorithm.MaxHeap;
import com.hankcs.hanlp.mining.word.WordInfo;
import com.hankcs.hanlp.utility.LexiconUtility;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.TreeMap;
import java.util.regex.Pattern;

/**
 * Extracts candidate words from raw text without relying on a dictionary.
 *
 * <p>Every line of input is split on a fixed set of delimiter characters, every remaining
 * substring of up to {@code max_word_len} characters is registered as a candidate together
 * with its neighbouring characters, and the candidates are then scored by {@link WordInfo}
 * and filtered against the thresholds supplied to the constructor.
 *
 * <p>Instances hold only immutable configuration; all working state lives in locals of
 * {@link #discover(BufferedReader, int)}.
 */
public class NewWordDiscover {
    /**
     * Marker that stands in for a run of delimiter characters. It is also the neighbour
     * reported to {@link WordInfo#update} when a candidate touches the start or end of a line.
     */
    private static final char BOUNDARY = '\0';

    /** Replacement text used when collapsing delimiter runs; a single {@link #BOUNDARY}. */
    private static final String BOUNDARY_REPLACEMENT = "\0";

    /**
     * Characters that can never be part of a candidate: whitespace, digits, ASCII letters and
     * ASCII / full-width punctuation. Compiled once because the expression is constant.
     */
    private static final Pattern DELIMITER = Pattern.compile("[\\s\\d,.<>/?:;'\"\\[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z\uff0c\u3002\u300a\u300b\u3001\uff1f\uff1a\uff1b\u201c\u201d\u2018\u2019\uff5b\uff5d\u3010\u3011\uff08\uff09\u2026\uffe5\uff01\u2014\u2504\uff0d]+");

    /** Orders candidates by their {@code p} field; handed to the {@link MaxHeap} that ranks results. */
    private static final Comparator<WordInfo> BY_FREQUENCY = new Comparator<WordInfo>(){

        @Override
        public int compare(WordInfo o1, WordInfo o2) {
            return Float.compare(o1.p, o2.p);
        }
    };

    private final int max_word_len;
    private final float min_freq;
    private final float min_entropy;
    private final float min_aggregation;
    private final boolean filter;

    /**
     * Creates a discoverer with the default settings: candidates of at most 4 characters,
     * minimum frequency 5.0E-5, minimum entropy 0.4, minimum aggregation 1.2, and no
     * lexicon filtering.
     */
    public NewWordDiscover() {
        this(4, 5.0E-5f, 0.4f, 1.2f, false);
    }

    /**
     * Creates a discoverer with explicit thresholds.
     *
     * @param max_word_len    maximum length, in characters, of a candidate substring
     * @param min_freq        candidates whose {@code p} is below this value are discarded
     * @param min_entropy     candidates whose {@code entropy} is below this value are discarded
     * @param min_aggregation candidates whose {@code aggregation} is below this value are discarded
     * @param filter          when {@code true}, candidates for which
     *                        {@code LexiconUtility.getFrequency} reports a positive value are
     *                        discarded (presumably words the lexicon already knows)
     */
    public NewWordDiscover(int max_word_len, float min_freq, float min_entropy, float min_aggregation, boolean filter) {
        this.max_word_len = max_word_len;
        this.min_freq = min_freq;
        this.min_entropy = min_entropy;
        this.min_aggregation = min_aggregation;
        this.filter = filter;
    }

    /**
     * Reads the whole of {@code reader} line by line and returns the surviving candidates.
     *
     * <p>The reader is consumed to the end but is not closed; that stays the caller's job.
     *
     * @param reader source of the text to analyse
     * @param size   capacity handed to the ranking {@link MaxHeap}; presumably the maximum
     *               number of results returned
     * @return the candidates that passed every threshold, as produced by
     *         {@code MaxHeap.toList()} with a comparator on {@code p}
     * @throws IOException if reading from {@code reader} fails
     */
    public List<WordInfo> discover(BufferedReader reader, int size) throws IOException {
        TreeMap<String, WordInfo> word_cands = new TreeMap<String, WordInfo>();
        int totalLength = this.collectCandidates(reader, word_cands);

        // Scoring is two full passes: the second one is given the whole candidate map, so
        // every candidate must have finished the first pass before it starts.
        for (WordInfo info : word_cands.values()) {
            info.computeProbabilityEntropy(totalLength);
        }
        for (WordInfo info : word_cands.values()) {
            info.computeAggregation(word_cands);
        }

        // Survivors are gathered in the map's iteration order, the same order in which the
        // previous remove-while-iterating implementation left them.
        List<WordInfo> accepted = new LinkedList<WordInfo>();
        for (WordInfo info : word_cands.values()) {
            if (this.isAccepted(info)) {
                accepted.add(info);
            }
        }

        MaxHeap<WordInfo> topN = new MaxHeap<WordInfo>(size, BY_FREQUENCY);
        topN.addAll(accepted);
        return topN.toList();
    }

    /**
     * Convenience overload that analyses an in-memory document.
     *
     * @param doc  the text to analyse; lines are separated as by {@link BufferedReader#readLine()}
     * @param size capacity handed to the ranking {@link MaxHeap}
     * @return the candidates that passed every threshold
     * @throws RuntimeException wrapping the {@link IOException} if reading the string fails
     */
    public List<WordInfo> discover(String doc, int size) {
        try {
            return this.discover(new BufferedReader(new StringReader(doc)), size);
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Registers every delimiter-free substring of every line in {@code candidates} and
     * reports each occurrence, with its left and right neighbour, to the matching
     * {@link WordInfo}.
     *
     * @return the summed length of all lines after delimiter runs were collapsed to a single
     *         boundary character each
     */
    private int collectCandidates(BufferedReader reader, TreeMap<String, WordInfo> candidates) throws IOException {
        int totalLength = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            String doc = DELIMITER.matcher(line).replaceAll(BOUNDARY_REPLACEMENT);
            int docLength = doc.length();
            for (int i = 0; i < docLength; ++i) {
                char left = i == 0 ? BOUNDARY : doc.charAt(i - 1);
                // Clamp the length before adding it to i so that a very large max_word_len
                // cannot overflow the end index.
                int end = i + 1 + Math.min(this.max_word_len, docLength - i);
                for (int j = i + 1; j < end; ++j) {
                    // Only the newly included character needs checking: earlier ones were
                    // checked on previous iterations. Once a boundary is inside the window
                    // every longer window from i contains it too, so stop extending.
                    if (doc.charAt(j - 1) == BOUNDARY) {
                        break;
                    }
                    String word = doc.substring(i, j);
                    WordInfo info = candidates.get(word);
                    if (info == null) {
                        info = new WordInfo(word);
                        candidates.put(word, info);
                    }
                    char right = j < docLength ? doc.charAt(j) : BOUNDARY;
                    info.update(left, right);
                }
            }
            totalLength += docLength;
        }
        return totalLength;
    }

    /**
     * Decides whether a scored candidate is kept.
     *
     * <p>The comparisons are deliberately written as "below threshold means reject" rather
     * than "at or above threshold means accept", so a NaN score is treated the same way it
     * always has been (it is not rejected). The lexicon is consulted only for candidates that
     * already passed the numeric thresholds, and only when filtering is enabled.
     */
    private boolean isAccepted(WordInfo info) {
        if (info.text.trim().length() < 2) {
            return false;
        }
        if (info.p < this.min_freq || info.entropy < this.min_entropy || info.aggregation < this.min_aggregation) {
            return false;
        }
        return !this.filter || LexiconUtility.getFrequency(info.text) <= 0;
    }
}

