/*
 * Decompiled with CFR 0.152.
 */
package de.tblsoft.solr.pipeline;

import com.google.common.base.Strings;
import de.tblsoft.solr.http.HTTPHelper;
import de.tblsoft.solr.pipeline.AbstractReader;
import de.tblsoft.solr.pipeline.bean.Document;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.XMLConstants;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.io.IOUtils;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Reader that turns the {@code <url>} entries of XML sitemaps into pipeline documents.
 *
 * <p>Sitemaps are taken from three configuration properties:
 * <ul>
 *   <li>{@code sitemapUrls} - sitemaps that are parsed directly,</li>
 *   <li>{@code sitemapIndexUrls} - sitemap index files whose {@code <loc>} entries are
 *       loaded and parsed,</li>
 *   <li>{@code urls} - addresses whose response is scanned for {@code Sitemap:} lines
 *       (robots.txt syntax).</li>
 * </ul>
 * Entries listed in the {@code sitemapBlacklits} property are skipped for the last two
 * sources. The optional {@code maxRows} property caps the number of emitted documents.
 */
public class XmlSitemapReader
extends AbstractReader {
    /** Matches a robots.txt sitemap line; group 1 is the URL without surrounding whitespace. */
    private static final Pattern SITEMAP_LINE = Pattern.compile("(?i)\\s*sitemap\\s*:\\s*(\\S.*?)\\s*");

    // The misspelling is kept on purpose: it mirrors the configuration key read in read().
    private List<String> sitemapBlacklits;
    /** Maximum number of documents to emit; {@code null} means unlimited. */
    private Long maxRows;
    /** Number of documents handed to the executer so far. */
    private long currentRow = 0L;

    /**
     * Reads the configuration and processes every configured sitemap source.
     *
     * @throws RuntimeException wrapping any failure raised while loading or parsing
     */
    @Override
    public void read() {
        this.sitemapBlacklits = this.getPropertyAsList("sitemapBlacklits", new ArrayList<String>());
        List<String> urls = this.getPropertyAsList("urls", new ArrayList<String>());
        List<String> sitemapIndexUrls = this.getPropertyAsList("sitemapIndexUrls", new ArrayList<String>());
        List<String> sitemapUrls = this.getPropertyAsList("sitemapUrls", new ArrayList<String>());
        this.maxRows = this.getPropertyAsInteger("maxRows", null);
        try {
            for (String sitemapUrl : sitemapUrls) {
                this.parseSitemap(this.getDomByUrl(sitemapUrl), null);
            }
            for (String indexUrl : sitemapIndexUrls) {
                if (this.sitemapBlacklits.contains(indexUrl)) {
                    continue;
                }
                this.processSitemapIndex(this.getDomByUrl(indexUrl), indexUrl);
            }
            for (String url : urls) {
                for (String sitemap : this.readSitemapUrlsFromRobotsTxt(url)) {
                    if (this.sitemapBlacklits.contains(sitemap)) {
                        continue;
                    }
                    // A referenced file may be a plain sitemap or a sitemap index, so it is
                    // run through both handlers; each ignores content that is not its own.
                    org.w3c.dom.Document doc = this.getDomByUrl(sitemap);
                    this.parseSitemap(doc, null);
                    this.processSitemapIndex(doc, sitemap);
                }
            }
        }
        catch (Exception e) {
            throw new RuntimeException("Failed to read sitemaps", e);
        }
    }

    /**
     * Loads the content behind {@code url} via {@code HTTPHelper.get} and parses it as XML.
     *
     * <p>The parser is hardened against XXE because the XML comes from a remote source:
     * secure processing is enabled and external entities / external DTDs are not resolved.
     *
     * @param url address of the XML resource
     * @return the parsed DOM document
     * @throws IOException if the content cannot be turned into a stream or read
     * @throws ParserConfigurationException if the parser cannot be configured as requested
     * @throws SAXException if the content is not well-formed XML
     */
    private org.w3c.dom.Document getDomByUrl(String url) throws IOException, ParserConfigurationException, SAXException {
        String sitemapContent = HTTPHelper.get(url);
        InputStream is = IOUtils.toInputStream(sitemapContent, "UTF-8");
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        dbFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        dbFactory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        dbFactory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        dbFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        dbFactory.setXIncludeAware(false);
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        return dBuilder.parse(is);
    }

    /**
     * Extracts the sitemap URLs announced by {@code Sitemap:} lines.
     *
     * <p>The directive name is matched case-insensitively; whitespace around the colon and
     * around the URL is ignored.
     *
     * @param domain address passed as-is to {@code HTTPHelper.get}; its response is scanned
     *               line by line
     * @return the announced sitemap URLs in order of appearance, possibly empty
     */
    List<String> readSitemapUrlsFromRobotsTxt(String domain) {
        String robots = HTTPHelper.get(domain);
        List<String> sitemapList = new ArrayList<String>();
        if (robots == null) {
            // Nothing to scan; treat a missing body as "no sitemaps announced".
            return sitemapList;
        }
        Scanner scanner = new Scanner(robots);
        try {
            while (scanner.hasNextLine()) {
                Matcher matcher = SITEMAP_LINE.matcher(scanner.nextLine());
                if (matcher.matches()) {
                    sitemapList.add(matcher.group(1));
                }
            }
        }
        finally {
            scanner.close();
        }
        return sitemapList;
    }

    /**
     * Treats {@code doc} as a sitemap index: loads every sitemap referenced by a
     * {@code <loc>} element and parses it. Documents without any {@code <sitemap>}
     * element are ignored.
     *
     * @param doc parsed XML that may be a sitemap index
     * @param sitemapIndexUrl URL of the index, recorded on every emitted document
     * @throws Exception if a referenced sitemap cannot be loaded or parsed
     */
    void processSitemapIndex(org.w3c.dom.Document doc, String sitemapIndexUrl) throws Exception {
        if (doc.getElementsByTagName("sitemap").getLength() == 0) {
            return;
        }
        List<String> childSitemapUrls = new ArrayList<String>();
        NodeList locNodes = doc.getElementsByTagName("loc");
        for (int i = 0; i < locNodes.getLength(); ++i) {
            Node text = locNodes.item(i).getFirstChild();
            // An empty <loc/> has no child, and a non-text child has no node value.
            if (text == null || text.getNodeValue() == null) {
                continue;
            }
            String loc = text.getNodeValue().trim();
            if (!loc.isEmpty()) {
                childSitemapUrls.add(loc);
            }
        }
        for (String loc : childSitemapUrls) {
            if (this.limitReached()) {
                // No further document would be emitted, so skip the remaining downloads.
                break;
            }
            this.parseSitemap(this.getDomByUrl(loc), sitemapIndexUrl);
        }
    }

    /**
     * Emits one document per {@code <url>} element found in {@code doc}.
     *
     * @param doc parsed sitemap
     * @param sitemapIndexUrl URL of the index that referenced this sitemap, or {@code null}
     * @throws RuntimeException wrapping any failure raised while handling an entry
     */
    void parseSitemap(org.w3c.dom.Document doc, String sitemapIndexUrl) {
        try {
            NodeList urls = doc.getElementsByTagName("url");
            for (int i = 0; i < urls.getLength() && !this.limitReached(); ++i) {
                this.parseSitemapUrlNode(urls.item(i).getChildNodes(), sitemapIndexUrl);
            }
        }
        catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds a document from the children of one {@code <url>} element and passes it to
     * {@code this.executer}, unless the {@code maxRows} limit has already been reached.
     *
     * <p>Each child element that has content becomes a field named after the element,
     * holding the value of the element's first child node.
     *
     * @param url child nodes of a {@code <url>} element
     * @param sitemapIndexUrl stored as field {@code sitemapIndexUrl} when not null or empty
     */
    void parseSitemapUrlNode(NodeList url, String sitemapIndexUrl) {
        if (this.limitReached()) {
            return;
        }
        Document document = new Document();
        if (!Strings.isNullOrEmpty(sitemapIndexUrl)) {
            document.addField("sitemapIndexUrl", sitemapIndexUrl);
        }
        for (int k = 0; k < url.getLength(); ++k) {
            Node child = url.item(k);
            if (child.getNodeType() != Node.ELEMENT_NODE || child.getFirstChild() == null) {
                continue;
            }
            document.addField(child.getNodeName(), child.getFirstChild().getNodeValue());
        }
        this.executer.document(document);
        this.currentRow++;
    }

    /** Returns whether {@code maxRows} documents have already been emitted. */
    private boolean limitReached() {
        // Strict cap: the counter starts at 0, so ">=" stops after exactly maxRows documents.
        return this.maxRows != null && this.currentRow >= this.maxRows;
    }
}

