/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.io.StringReader;
import java.io.Writer;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

public class ChineseDocumentToSentenceProcessor
implements Serializable {
    /** Redwood logging channel used for all diagnostics emitted by this class. */
    private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseDocumentToSentenceProcessor.class);
    private static final long serialVersionUID = 4054964767812217460L;
    /**
     * Characters that terminate a sentence: U+3002 (ideographic full stop),
     * U+FF01 / U+FF1F (fullwidth exclamation / question mark) and ASCII '!' and '?'.
     */
    private static final Set<Character> fullStopsSet = Generics.newHashSet(Arrays.asList('\u3002', '\uff01', '\uff1f', '!', '?'));
    /**
     * Closing quotes and brackets (CJK, fullwidth and ASCII variants) that may
     * follow a sentence terminator and still belong to the sentence being closed.
     */
    private static final Set<Character> rightMarkSet = Generics.newHashSet(Arrays.asList('\u201d', '\u2019', '\u300b', '\u300f', '\u3009', '\u300d', '\uff1e', '\uff07', '\uff09', '\'', '\"', ')', ']', '>'));
    /** Character encoding used for every file read and console write in this class. */
    private static final String encoding = "UTF-8";
    /**
     * Ordered (text-to-find, replacement) pairs loaded by the constructor, or
     * {@code null} when no normalization table file was supplied.
     */
    private final List<Pair<String, String>> normalizationTable;
    /** Matches one normalization-table line: two whitespace-separated tokens. */
    private static final Pattern PAIR_PATTERN = Pattern.compile("([^\\s]+)\\s+([^\\s]+)");
    /** One or more whitespace or Unicode space-separator (Zs) characters, anywhere. */
    private static final Pattern WHITEPLUS_PATTERN = Pattern.compile("[\\s\\p{Zs}]+");
    /** Same character class as {@link #WHITEPLUS_PATTERN}, anchored at the start of the input. */
    private static final Pattern START_WHITEPLUS_PATTERN = Pattern.compile("^[\\s\\p{Zs}]+");
    /** Same character class as {@link #WHITEPLUS_PATTERN}, anchored at the end of the input. */
    private static final Pattern END_WHITEPLUS_PATTERN = Pattern.compile("[\\s\\p{Zs}]+$");

    /**
     * Creates a processor without a normalization table; the table-driven
     * replacement step is then skipped and only the {@code ChineseUtils}
     * normalization is applied by {@link #normalization(String)}.
     */
    public ChineseDocumentToSentenceProcessor() {
        this((String) null);
    }

    public ChineseDocumentToSentenceProcessor(String normalizationTableFile) {
        if (normalizationTableFile != null) {
            this.normalizationTable = new ArrayList<Pair<String, String>>();
            for (String line : ObjectBank.getLineIterator(new File(normalizationTableFile), encoding)) {
                Matcher pairMatcher = PAIR_PATTERN.matcher(line);
                if (pairMatcher.find()) {
                    this.normalizationTable.add(new Pair<String, String>(pairMatcher.group(1), pairMatcher.group(2)));
                    continue;
                }
                log.info("Didn't match: " + line);
            }
        } else {
            this.normalizationTable = null;
        }
    }

    /**
     * Normalizes a string in two passes: first {@code ChineseUtils.normalize},
     * then this processor's own normalization table (a no-op when no table was
     * loaded).
     *
     * @param in the text to normalize
     * @return the normalized text
     */
    public String normalization(String in) {
        return this.normalize(ChineseUtils.normalize(in));
    }

    /**
     * Applies the normalization table to a string.
     *
     * <p>When no table was loaded the input is returned untouched. Otherwise every
     * run of whitespace / space-separator characters is first collapsed to a
     * single ASCII space, and then each table entry is applied in file order,
     * replacing every occurrence of the entry's first string with its second.
     *
     * <p>Both sides of an entry are treated as literal text. Previously only a
     * replacement equal to {@code "$"} was escaped, so any other replacement
     * containing {@code '$'} or {@code '\\'} was interpreted as a group reference
     * or escape by {@code Matcher.replaceAll} and could throw.
     *
     * @param inputString the text to normalize
     * @return the text with all table replacements applied
     */
    private String normalize(String inputString) {
        if (this.normalizationTable == null) {
            return inputString;
        }
        inputString = WHITEPLUS_PATTERN.matcher(inputString).replaceAll(" ");
        for (Pair<String, String> entry : this.normalizationTable) {
            // String.replace is literal on both the target and the replacement.
            inputString = inputString.replace(entry.first(), entry.second());
        }
        return inputString;
    }

    /**
     * Command-line entry point.
     *
     * <p>Recognized properties (parsed by {@code StringUtils.argsToProperties}):
     * <ul>
     *   <li>{@code file} - required; the input file, always read as UTF-8.</li>
     *   <li>{@code encoding} - accepted but ignored; only triggers a warning.</li>
     *   <li>{@code segmentIBM} - treat the input as whitespace-tokenized text
     *       interleaved with SGML-like tags; text between tags is sentence-split
     *       and written to standard output together with the tags.</li>
     *   <li>{@code parseInside} - (with {@code segmentIBM}) regex alternation of
     *       tag names; when non-empty, only text following a matching tag is
     *       split, other text is echoed unchanged.</li>
     *   <li>{@code alwaysAddS} - (with {@code segmentIBM}) wrap sentences in
     *       {@code <s>} elements even when a text span yields a single sentence.</li>
     * </ul>
     * Without {@code segmentIBM} the input is parsed as HTML and the resulting
     * sentences are printed one per line to standard error.
     *
     * @param args command-line arguments
     * @throws Exception on any failure while reading, parsing or writing
     */
    public static void main(String[] args) throws Exception {
        Properties options = StringUtils.argsToProperties(args);
        boolean alwaysAddS = options.containsKey("alwaysAddS");
        if (!options.containsKey("file")) {
            log.info("usage: java ChineseDocumentToSentenceProcessor [-segmentIBM] -file filename [-encoding encoding]");
            return;
        }
        // Kept from the original flow; the instance is not used by the code below.
        ChineseDocumentToSentenceProcessor cp = new ChineseDocumentToSentenceProcessor();
        if (options.containsKey("encoding")) {
            log.info(new Object[]{"WARNING: for now the default encoding is " + encoding + ". It's not changeable for now"});
        }
        String input = IOUtils.slurpFileNoExceptions(options.getProperty("file"), encoding);

        // HTML mode: split everything the HTML parser yields and dump it to stderr.
        if (!options.containsKey("segmentIBM")) {
            List<String> htmlSentences = ChineseDocumentToSentenceProcessor.fromHTML(input);
            PrintWriter errWriter = new PrintWriter((Writer)new OutputStreamWriter((OutputStream)System.err, encoding), true);
            for (String sentence : htmlSentences) {
                errWriter.println(sentence);
            }
            return;
        }

        // Segmented SGML mode.
        WhitespaceTokenizer<Word> tokens = WhitespaceTokenizer.newWordWhitespaceTokenizer(new StringReader(input), true);
        String parseInside = options.getProperty("parseInside", "");
        PrintWriter out = new PrintWriter((Writer)new OutputStreamWriter((OutputStream)System.out, encoding), true);
        StringBuilder textBuffer = new StringBuilder();
        StringBuilder sgmlBuffer = new StringBuilder();
        String previousSgml = "";
        Pattern completeTag = Pattern.compile("<.*>");
        Pattern tagStart = Pattern.compile("\ufeff?<[\\p{Alpha}]+");
        Pattern tagEnd = Pattern.compile("[A-Za-z0-9=\"]+>");
        Pattern parseInsideTag = Pattern.compile("<(?:" + parseInside + ")[ >]");
        boolean insideSgml = false;
        int splitItems = 0;
        int numAdded = 0;

        while (tokens.hasNext()) {
            String token = tokens.next().word();
            if (tagStart.matcher(token).matches()) {
                // Beginning of a tag that continues over further tokens (attributes).
                insideSgml = true;
                sgmlBuffer.append(token).append(" ");
            } else if (completeTag.matcher(token).matches()
                    || (insideSgml && tagEnd.matcher(token).matches())
                    || "\n".equals(token)) {
                // A tag just finished (or a newline token arrived): emit pending text first.
                insideSgml = false;
                String pendingText = textBuffer.toString();
                if (!pendingText.trim().isEmpty()) {
                    boolean splitThisSpan = parseInside.equals("") || parseInsideTag.matcher(previousSgml).find();
                    if (!splitThisSpan) {
                        out.print(pendingText);
                    } else {
                        List<String> sentences = ChineseDocumentToSentenceProcessor.fromPlainText(pendingText, true);
                        int count = sentences.size();
                        if (alwaysAddS || count > 1) {
                            for (int idx = 0; idx < count; ++idx) {
                                out.print("<s id=\"" + (idx + 1) + "\">");
                                out.print(sentences.get(idx));
                                out.println("</s>");
                            }
                            if (count > 1) {
                                ++splitItems;
                                numAdded += count - 1;
                            }
                        } else if (count == 1) {
                            out.print(sentences.get(0));
                        }
                    }
                    textBuffer.setLength(0);
                }
                // Then emit the tag itself and remember it for the parseInside check.
                String tagText = sgmlBuffer.append(token).toString();
                out.print(tagText);
                previousSgml = tagText;
                sgmlBuffer.setLength(0);
            } else {
                // Ordinary token: belongs to the open tag if there is one, else to the text.
                StringBuilder target = insideSgml ? sgmlBuffer : textBuffer;
                target.append(token).append(" ");
            }
        }
        out.flush();
        out.close();
        log.info("Split " + splitItems + " segments, adding " + numAdded + " sentences.");
    }

    /**
     * Extracts the text chunks of an HTML document and splits each chunk into
     * sentences using the unsegmented mode of {@link #fromPlainText(String)}.
     *
     * @param inputString the HTML source
     * @return all sentences, in document order
     * @throws IOException if the HTML parser fails
     */
    public static List<String> fromHTML(String inputString) throws IOException {
        List<String> textChunks = new MyHTMLParser().parse(inputString);
        ArrayList<String> sentences = new ArrayList<String>();
        for (String chunk : textChunks) {
            sentences.addAll(fromPlainText(chunk));
        }
        return sentences;
    }

    /**
     * Splits unsegmented plain text into sentences; shorthand for
     * {@code fromPlainText(contentString, false)}.
     *
     * @param contentString the text to split
     * @return the sentences found, in order
     * @throws IOException declared by the two-argument overload
     */
    public static List<String> fromPlainText(String contentString) throws IOException {
        return fromPlainText(contentString, false);
    }

    /**
     * Splits plain text into sentences.
     *
     * <p>The text is first passed through {@code ChineseUtils.normalize} (with
     * arguments {@code 0, 1} when segmented and {@code 2, 1} otherwise;
     * NOTE(review): these look like ASCII / space handling modes, confirm against
     * {@code ChineseUtils}). It is then scanned one character at a time:
     * <ul>
     *   <li>A sentence ends at a character from the full-stop set. In segmented
     *       mode the full stop only counts when it is the very first character or
     *       is preceded by a space character, i.e. when it is its own token.</li>
     *   <li>After a sentence end, closing marks, whitespace and further full stops
     *       are still attached to the sentence being closed.</li>
     *   <li>The first other character starts the next sentence.</li>
     * </ul>
     * Each finished sentence goes through {@code removeWhitespace}; sentences
     * that end up empty are dropped.
     *
     * <p>The scan accumulates into a {@code StringBuilder} and tests whitespace
     * with a char comparison, instead of concatenating {@code String}s and
     * compiling a regex for every character.
     *
     * @param contentString the text to split
     * @param segmented whether the text is already word-segmented (tokens
     *        separated by spaces); also controls whether inner whitespace is kept
     * @return the sentences found, in order
     * @throws IOException never thrown by this implementation; kept for
     *         signature compatibility
     */
    public static List<String> fromPlainText(String contentString, boolean segmented) throws IOException {
        contentString = segmented ? ChineseUtils.normalize(contentString, 0, 1) : ChineseUtils.normalize(contentString, 2, 1);
        List<String> sentenceList = new ArrayList<String>();
        StringBuilder sentence = new StringBuilder();
        boolean sentenceEnd = false;
        int lastCh = -1;
        for (int i = 0, n = contentString.length(); i < n; ++i) {
            char c = contentString.charAt(i);
            if (!sentenceEnd) {
                sentence.append(c);
                // In segmented mode a full stop must stand alone as a token to count.
                if (fullStopsSet.contains(c) && (!segmented || lastCh == -1 || Character.isSpaceChar(lastCh))) {
                    sentenceEnd = true;
                }
            } else if (rightMarkSet.contains(c) || isRegexWhitespace(c) || fullStopsSet.contains(c)) {
                // Trailing quotes/brackets, whitespace and repeated terminators stay with the sentence.
                sentence.append(c);
            } else {
                // Any other character closes the pending sentence and opens a new one.
                String finished = ChineseDocumentToSentenceProcessor.removeWhitespace(sentence.toString(), segmented);
                if (finished.length() > 0) {
                    sentenceList.add(finished);
                }
                sentence.setLength(0);
                sentence.append(c);
                sentenceEnd = false;
            }
            lastCh = c;
        }
        // Flush whatever is left after the last character.
        String remainder = ChineseDocumentToSentenceProcessor.removeWhitespace(sentence.toString(), segmented);
        if (remainder.length() > 0) {
            sentenceList.add(remainder);
        }
        return sentenceList;
    }

    /** Returns whether {@code c} is in the default regex class {@code \s}: space, tab, LF, VT, FF or CR. */
    private static boolean isRegexWhitespace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\013' || c == '\f' || c == '\r';
    }

    /**
     * Strips leading and trailing whitespace / space-separator characters from a
     * sentence. For unsegmented text every remaining inner whitespace run is
     * removed as well; for segmented text inner whitespace is left as is.
     *
     * @param str the sentence to clean
     * @param segmented whether inner whitespace must be preserved
     * @return the cleaned sentence ({@code str} itself when it is empty)
     */
    private static String removeWhitespace(String str, boolean segmented) {
        if (str.length() == 0) {
            return str;
        }
        String withoutLeading = START_WHITEPLUS_PATTERN.matcher(str).replaceAll("");
        String trimmed = END_WHITEPLUS_PATTERN.matcher(withoutLeading).replaceAll("");
        if (segmented) {
            return trimmed;
        }
        return WHITEPLUS_PATTERN.matcher(trimmed).replaceAll("");
    }

    /**
     * Swing HTML parser callback that collects the text content of a document as
     * a list of trimmed, non-empty chunks, and remembers the most recent text
     * seen inside a {@code <title>} element.
     */
    static class MyHTMLParser
    extends HTMLEditorKit.ParserCallback {
        /** Text accumulated since the last chunk was emitted. */
        protected StringBuilder textBuffer;
        /** Chunks collected by the current or most recent parse. */
        protected List<String> sentences;
        /** Last text delivered while inside a title element; empty until one is seen. */
        protected String title = "";
        // Element-tracking flags maintained by the start/end tag callbacks.
        protected boolean isTitle = false;
        protected boolean isBody = false;
        protected boolean isScript = false;
        /** Whether the most recently opened tag breaks the text flow. */
        protected boolean isBreak = false;

        /**
         * Receives a run of text. The text is appended to the buffer; if the
         * buffer, with U+00A0 characters removed and then trimmed, is non-empty it
         * is emitted as a chunk and the buffer is reset. Title text is recorded in
         * {@link #title} and also flows into the chunks like any other text.
         */
        @Override
        public void handleText(char[] data, int pos) {
            if (data.length == 0) {
                return;
            }
            if (this.isTitle) {
                this.title = new String(data);
            }
            this.textBuffer.append(data);
            String cleaned = this.textBuffer.toString().replaceAll("\u00a0", "").trim();
            if (!cleaned.isEmpty()) {
                this.sentences.add(cleaned);
                this.textBuffer = new StringBuilder(500);
            }
        }

        /** Marks title/body/script as open and records whether the tag breaks flow. */
        @Override
        public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrSet, int pos) {
            this.recordTagState(tag, true);
            this.isBreak = tag.breaksFlow();
        }

        /** Marks title/body/script as closed. */
        @Override
        public void handleEndTag(HTML.Tag tag, int pos) {
            this.recordTagState(tag, false);
        }

        /** Sets the flag belonging to {@code tag}, if it is one of the tracked tags. */
        private void recordTagState(HTML.Tag tag, boolean open) {
            if (tag == HTML.Tag.TITLE) {
                this.isTitle = open;
            } else if (tag == HTML.Tag.BODY) {
                this.isBody = open;
            } else if (tag == HTML.Tag.SCRIPT) {
                this.isScript = open;
            }
        }

        /** Downloads the content at {@code url} and parses it as HTML. */
        public List<String> parse(URL url) throws IOException {
            String content = IOUtils.slurpURL(url);
            return this.parse(content);
        }

        /** Reads {@code r} to the end and parses the content as HTML. */
        public List<String> parse(Reader r) throws IOException {
            String content = IOUtils.slurpReader(r);
            return this.parse(content);
        }

        /**
         * Parses HTML text and returns its text chunks in document order.
         * Before parsing, every {@code "/>"} is rewritten to {@code ">"} and every
         * {@code "<?"} to {@code "<"} (presumably so that XML-style markup does not
         * confuse the Swing parser).
         *
         * @param text the HTML source
         * @return the collected chunks (the same list instance held in {@link #sentences})
         * @throws IOException if the underlying parser fails
         */
        public List<String> parse(String text) throws IOException {
            String sanitized = text.replaceAll("/>", ">").replaceAll("<\\?", "<");
            this.textBuffer = new StringBuilder(200);
            this.sentences = new ArrayList<String>();
            StringReader source = new StringReader(sanitized);
            // Third argument: ignore any charset declaration inside the document.
            new ParserDelegator().parse(source, this, true);
            return this.sentences;
        }

        /** Returns the recorded title text (empty if none was seen). */
        public String title() {
            return this.title;
        }
    }
}

