/*
 * Decompiled with CFR 0.152.
 */
package it.unimi.di.law.bubing.frontier;

import it.unimi.di.law.bubing.RuntimeConfiguration;
import it.unimi.di.law.bubing.frontier.Frontier;
import it.unimi.di.law.bubing.parser.HTMLParser;
import it.unimi.di.law.bubing.parser.Parser;
import it.unimi.di.law.bubing.parser.SpamTextProcessor;
import it.unimi.di.law.bubing.store.Store;
import it.unimi.di.law.bubing.util.BURL;
import it.unimi.di.law.bubing.util.Link;
import it.unimi.di.law.bubing.util.URLRespectsRobots;
import it.unimi.di.law.bubing.util.Util;
import it.unimi.di.law.warc.filters.Filter;
import it.unimi.di.law.warc.records.HttpResponseWarcRecord;
import it.unimi.dsi.fastutil.bytes.ByteArrayList;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.shorts.Short2ShortMap;
import java.io.IOException;
import java.net.NoRouteToHostException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URI;
import java.net.UnknownHostException;
import java.nio.BufferOverflowException;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import javax.net.ssl.SSLPeerUnverifiedException;
import org.apache.commons.lang.StringUtils;
import org.apache.http.ConnectionClosedException;
import org.apache.http.MalformedChunkCodingException;
import org.apache.http.NoHttpResponseException;
import org.apache.http.TruncatedChunkException;
import org.apache.http.client.CircularRedirectException;
import org.apache.http.client.RedirectException;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.conn.HttpHostConnectException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ParsingThread
extends Thread {
    /** Logger shared by this class and by the nested {@link FrontierEnqueuer}. */
    private static final Logger LOGGER = LoggerFactory.getLogger(ParsingThread.class);
    /**
     * Base wait time, in milliseconds, before a fetch is retried after an exception of the given class.
     * {@link #run()} shifts this value left by the current retry count. Populated in the static
     * initializer; classes without an entry get the default return value set there (one hour).
     */
    protected static final Object2LongOpenHashMap<Class<?>> EXCEPTION_TO_WAIT_TIME = new Object2LongOpenHashMap();
    /**
     * Retry limit per exception class: {@link #run()} schedules a retry only while the visit state's
     * retry counter is below this value. Populated in the static initializer; classes without an
     * entry get the default return value set there (5).
     */
    protected static final Object2IntOpenHashMap<Class<?>> EXCEPTION_TO_MAX_RETRIES = new Object2IntOpenHashMap();
    /**
     * Exception classes that, once the retry limit is reached, make {@link #run()} schedule a purge
     * of the whole visit state instead of just dequeuing the failing URL.
     */
    protected static final ObjectOpenHashSet<Class<?>> EXCEPTION_HOST_KILLER = new ObjectOpenHashSet();
    /** When set, {@link #run()} returns the next time it finds no fetched result to process. */
    public volatile boolean stop;
    /** Number format passed to the size formatter when logging the download speed of a fetched URL. */
    private final NumberFormat formatDouble = new DecimalFormat("#,##0.00");
    /** The frontier whose fetched results this thread consumes. */
    private final Frontier frontier;
    /** The store that receives responses accepted by the store filter. */
    private final Store store;
    /** This thread's own copies of the parsers configured in {@code frontier.rc.parsers}. */
    public final ArrayList<Parser<?>> parsers;

    /**
     * Creates a parsing thread named after its class and index, with a private copy of every
     * configured parser.
     *
     * @param frontier the frontier providing fetched results and the runtime configuration.
     * @param store the store that will receive the responses to be saved.
     * @param index a number appended to the thread name (after a dash) to tell instances apart.
     */
    public ParsingThread(Frontier frontier, Store store, int index) {
        this.setName(this.getClass().getSimpleName() + '-' + index);
        this.frontier = frontier;
        this.store = store;
        // Explicit type arguments instead of a raw ArrayList, so the assignment is type-checked.
        this.parsers = new ArrayList<Parser<?>>(frontier.rc.parsers.size());
        // Each thread works on its own parser copies rather than on the configured instances.
        for (Parser<?> parser : frontier.rc.parsers) {
            this.parsers.add((Parser<?>)parser.copy());
        }
        // Halfway between minimum and normal priority; evaluates to 3, the value used before.
        this.setPriority((Thread.NORM_PRIORITY + Thread.MIN_PRIORITY) / 2);
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     * Unable to fully structure code
     * Enabled aggressive block sorting
     * Enabled unnecessary exception pruning
     * Enabled aggressive exception aggregation
     * Converted monitor instructions to comments
     * Lifted jumps to return sites
     */
    /**
     * Main loop of the parsing thread.
     *
     * <p>Repeatedly polls {@code frontier.results} for fetched data, updates the next-fetch
     * timestamps of the associated visit state and workbench entry, and then takes one of three
     * paths: handling a fetch exception (retry, drop the URL, or purge the visit state),
     * processing a robots.txt response, or parsing, digesting, optionally storing a regular
     * response and enqueuing its outlinks. On every path the fetch data is finally marked as not
     * in use and {@code notify()} is invoked on it.
     *
     * <p>The loop ends when {@link #stop} is found set while no result is available, when the
     * thread is interrupted, or when an unexpected {@link Throwable} is caught and logged.
     *
     * <p>NOTE(review): this body is raw decompiler output and is not valid Java as written
     * (the {@code lbl-1000} label, {@code ** while (true)}, locals without declarations, a
     * {@code catch} detached from its {@code try}, and monitor enter/exit rendered as comments).
     * The MONITORENTER/MONITOREXIT markers presumably stand for {@code synchronized} blocks on
     * the fetch data, and the detached {@code catch (Throwable)} for a {@code finally} block;
     * confirm against the upstream sources before restructuring.
     */
    @Override
    public void run() {
        try {
            rc = this.frontier.rc;
            // A single enqueuer is reused for all pages; it is re-initialized per page via init().
            frontierLinkReceiver = new FrontierEnqueuer(this.frontier, rc);
            while (true) lbl-1000:
            // 4 sources

            {
                block69: {
                    block67: {
                        block66: {
                            rc.ensureNotPaused();
                            // Wait for a fetched result, sleeping 1, 2, 4, ... ms, capped at 1 << 10 ms.
                            i = 0;
                            while ((fetchData = this.frontier.results.poll()) == null) {
                                rc.ensureNotPaused();
                                if (this.stop) {
                                    return;
                                }
                                Thread.sleep(1 << Math.min(i, 10));
                                ++i;
                            }
                            visitState = fetchData.visitState;
                            if (ParsingThread.LOGGER.isTraceEnabled()) {
                                ParsingThread.LOGGER.trace("Got fetched response for visit state " + visitState);
                            }
                            // Next fetch time of the workbench entry: the configured IP delay, possibly
                            // raised by ipDelayFactor * knownCount * entrySize / (entrySize + 1).
                            // This is set regardless of how the fetch went.
                            entrySize = visitState.workbenchEntry.size();
                            ipDelay = rc.ipDelay;
                            knownCount = this.frontier.agent.getKnownCount();
                            if (knownCount > 1 && rc.ipDelayFactor != 0.0) {
                                ipDelay = Math.max(ipDelay, (long)((double)rc.ipDelay * rc.ipDelayFactor * (double)this.frontier.agent.getKnownCount() * (double)entrySize / ((double)entrySize + 1.0)));
                            }
                            visitState.workbenchEntry.nextFetch = fetchData.endTime + ipDelay;
                            // No exception: leave this block and continue with the successful-fetch path.
                            if (fetchData.exception == null) break block66;
                            ParsingThread.LOGGER.warn("Exception while fetching " + fetchData.uri(), fetchData.exception);
                            exceptionClass = fetchData.exception.getClass();
                            if (visitState.lastExceptionClass != exceptionClass) {
                                // A different exception class than last time. If there was none before,
                                // the visit state has just become broken: count it and reset retries.
                                if (visitState.lastExceptionClass == null) {
                                    this.frontier.brokenVisitStates.incrementAndGet();
                                    visitState.retries = 0;
                                }
                                visitState.lastExceptionClass = exceptionClass;
                            } else {
                                // Same exception class as last time: one more retry.
                                ++visitState.retries;
                            }
                            if (visitState.retries < ParsingThread.EXCEPTION_TO_MAX_RETRIES.getInt(exceptionClass)) {
                                // Retry later; the base wait time doubles with every retry.
                                delay = ParsingThread.EXCEPTION_TO_WAIT_TIME.getLong(exceptionClass) << visitState.retries;
                                visitState.nextFetch = fetchData.endTime + delay;
                                ParsingThread.LOGGER.info("Will retry URL " + fetchData.uri() + " of visit state " + visitState + " for " + exceptionClass.getSimpleName() + " with delay " + delay);
                            } else {
                                // Retries exhausted: the visit state no longer counts as broken.
                                this.frontier.brokenVisitStates.decrementAndGet();
                                // Host-killer exceptions, and any exhausted failure on a robots fetch,
                                // purge the whole visit state; otherwise only the failing URL is dropped.
                                if (ParsingThread.EXCEPTION_HOST_KILLER.contains(exceptionClass) || fetchData.robots) {
                                    visitState.schedulePurge();
                                    ParsingThread.LOGGER.warn("Visit state " + visitState + " killed by " + exceptionClass.getSimpleName() + " (URL: " + fetchData.uri() + ")");
                                } else {
                                    visitState.dequeue();
                                    visitState.lastExceptionClass = null;
                                    visitState.nextFetch = fetchData.endTime + rc.schemeAuthorityDelay;
                                    ParsingThread.LOGGER.info("URL " + fetchData.uri() + " killed by " + exceptionClass.getSimpleName());
                                }
                            }
                            // Release the fetch data and move on to the next result.
                            // NOTE(review): "delay" is a decompiler-reused slot holding fetchData here.
                            delay = fetchData;
                            // MONITORENTER : delay
                            fetchData.inUse = false;
                            fetchData.notify();
                            // MONITOREXIT : delay
                            continue;
                        }
                        // Successful fetch: remove the fetched path from the visit state.
                        firstPath /* !! */  = visitState.dequeue();
                        if (ParsingThread.LOGGER.isTraceEnabled()) {
                            ParsingThread.LOGGER.trace("Dequeuing " + Util.toString(firstPath /* !! */ ) + " after fetching " + fetchData.uri() + "; " + (visitState.isEmpty() != false ? "visit state is now empty " : " first path now is " + Util.toString(visitState.firstPath())));
                        }
                        visitState.nextFetch = fetchData.endTime + rc.schemeAuthorityDelay;
                        // A success clears any pending exception, so the visit state is no longer broken.
                        if (visitState.lastExceptionClass != null) {
                            this.frontier.brokenVisitStates.decrementAndGet();
                        }
                        visitState.lastExceptionClass = null;
                        // Not a robots fetch: leave this block and continue with the regular-page path.
                        if (!fetchData.robots) break block67;
                        // Robots fetch: record the response, parse it into the visit state's robots
                        // filter, and purge the visit state if no filter could be obtained.
                        this.frontier.fetchedRobots.incrementAndGet();
                        this.frontier.robotsWarcParallelOutputStream.write(new HttpResponseWarcRecord(fetchData.uri(), fetchData.response()));
                        visitState.robotsFilter = URLRespectsRobots.parseRobotsResponse(fetchData, rc.userAgent);
                        if (visitState.robotsFilter == null) {
                            visitState.schedulePurge();
                            ParsingThread.LOGGER.warn("Visit state " + visitState + " killed by null robots.txt");
                        }
                        visitState.lastRobotsFetch = fetchData.endTime;
                        // Release the fetch data and move on to the next result.
                        // NOTE(review): "firstPath" is a decompiler-reused slot holding fetchData here;
                        // the (byte[]) cast is a decompilation artifact.
                        v0 = fetchData;
                        firstPath /* !! */  = (byte[])v0;
                        // MONITORENTER : v0
                        fetchData.inUse = false;
                        fetchData.notify();
                        // MONITOREXIT : firstPath /* !! */ 
                        continue;
                    }
                    // Regular page.
                    url = fetchData.uri();
                    this.frontier.fetchedResources.incrementAndGet();
                    digest = null;
                    guessedCharset = null;
                    // Links are collected only if the follow filter accepts the response.
                    linkReceiver = rc.followFilter.apply(fetchData) != false ? new HTMLParser.SetLinkReceiver() : Parser.NULL_LINK_RECEIVER;
                    frontierLinkReceiver.init(fetchData.uri(), visitState.schemeAuthority, visitState.robotsFilter);
                    streamLength = fetchData.response().getEntity().getContentLength();
                    // A parseable Location header is resolved against the page URI and passed to the link receiver.
                    locationHeader = fetchData.response().getFirstHeader("Location");
                    if (locationHeader != null && (location = BURL.parse(locationHeader.getValue())) != null) {
                        if (!location.isAbsolute() && ParsingThread.LOGGER.isDebugEnabled()) {
                            ParsingThread.LOGGER.debug("Found relative header location URL: \"{}\"", (Object)location);
                        }
                        linkReceiver.location(fetchData.uri().resolve(location));
                    }
                    try {
                        if (rc.parseFilter.apply(fetchData)) {
                            parserFound = false;
                            // Only the first parser that accepts the response is used (see the break below).
                            for (Parser<?> parser : this.parsers) {
                                block68: {
                                    if (!parser.apply(fetchData)) continue;
                                    parserFound = true;
                                    try {
                                        digest = parser.parse(fetchData.uri(), fetchData.response(), linkReceiver);
                                        // Skip spam detection when no detector is configured, or when the update
                                        // threshold has been reached and the periodicity is Integer.MAX_VALUE.
                                        if (rc.spamDetector == null || visitState.termCountUpdates >= rc.spamDetectionThreshold && rc.spamDetectionPeriodicity == 0x7FFFFFFF) break block68;
                                        result = parser.result();
                                        if (result instanceof SpamTextProcessor.TermCount) {
                                            visitState.updateTermCount((Short2ShortMap)((SpamTextProcessor.TermCount)result));
                                        }
                                        // Re-estimate spammicity whenever the number of updates past the
                                        // threshold is a multiple of the periodicity.
                                        if ((visitState.termCountUpdates - rc.spamDetectionThreshold) % rc.spamDetectionPeriodicity == 0) {
                                            visitState.spammicity = (float)rc.spamDetector.estimate(visitState.termCount);
                                            ParsingThread.LOGGER.info("Spammicity for " + visitState + ": " + visitState.spammicity + " (" + visitState.termCountUpdates + " updates)");
                                        }
                                    }
                                    catch (BufferOverflowException e) {
                                        ParsingThread.LOGGER.warn("Buffer overflow during parsing of " + url + " with " + parser);
                                    }
                                    catch (IOException e) {
                                        ParsingThread.LOGGER.warn("An exception occurred while parsing " + url + " with " + parser, (Throwable)e);
                                    }
                                }
                                guessedCharset = parser.guessedCharset();
                                break;
                            }
                            if (!parserFound) {
                                ParsingThread.LOGGER.info("I'm not parsing page " + url + " because I could not find a suitable parser");
                            }
                            // Statistics: total outdegree, and number of links whose host differs from the page's.
                            this.frontier.outdegree.add((double)linkReceiver.size());
                            currentHost = url.getHost();
                            currentOutHostDegree = 0;
                            for (URI u : linkReceiver) {
                                if (currentHost.equals(u.getHost())) continue;
                                ++currentOutHostDegree;
                            }
                            this.frontier.externalOutdegree.add((double)currentOutHostDegree);
                            break block69;
                        }
                        if (ParsingThread.LOGGER.isDebugEnabled()) {
                            ParsingThread.LOGGER.debug("I'm not parsing page " + url);
                        }
                    }
                    catch (Exception e) {
                        // Parsing problems are logged; processing continues with digest/store below.
                        ParsingThread.LOGGER.warn("Exception during parsing of " + url, (Throwable)e);
                    }
                }
                mustBeStored = rc.storeFilter.apply(fetchData);
                // If no parser produced a digest, compute one with the fetch data's binary parser.
                if (digest == null) {
                    if (streamLength != 0L && ParsingThread.LOGGER.isDebugEnabled()) {
                        ParsingThread.LOGGER.debug("Computing binary digest for " + url);
                    }
                    digest = fetchData.binaryParser.parse(fetchData.uri(), fetchData.response(), null);
                }
                // Responses with a declared length of zero are never treated as duplicates; otherwise the
                // outcome of addHash() decides (presumably true when the digest was not already known).
                v1 = isNotDuplicate = streamLength == 0L || this.frontier.digests.addHash(digest) != false;
                if (ParsingThread.LOGGER.isTraceEnabled()) {
                    ParsingThread.LOGGER.trace("Decided that for {} isNotDuplicate={}", (Object)url, (Object)isNotDuplicate);
                }
                // Outlinks are enqueued only for non-duplicate pages.
                if (isNotDuplicate) {
                    for (URI u : linkReceiver) {
                        frontierLinkReceiver.enqueue(u);
                    }
                } else {
                    fetchData.isDuplicate(true);
                }
                if (mustBeStored) {
                    if (isNotDuplicate) {
                        // Count the URL for its scheme+authority and purge the visit state when the value
                        // returned by addTo() is at least maxUrlsPerSchemeAuthority - 1.
                        if (this.frontier.schemeAuthority2Count.addTo(visitState.schemeAuthority, 1) >= rc.maxUrlsPerSchemeAuthority - 1) {
                            ParsingThread.LOGGER.info("Reached maximum number of URLs for scheme+authority " + Util.toString(visitState.schemeAuthority));
                            visitState.schedulePurge();
                        }
                        // Status statistics by hundreds digit (1..5); anything else goes to slot 0.
                        if ((code = fetchData.response().getStatusLine().getStatusCode() / 100) > 0 && code < 6) {
                            this.frontier.archetypesStatus[code].incrementAndGet();
                        } else {
                            this.frontier.archetypesStatus[0].incrementAndGet();
                        }
                        // Negative lengths are left out of the content-length statistics.
                        if (streamLength >= 0L) {
                            this.frontier.contentLength.add((double)streamLength);
                        }
                        // Content-type statistics by case-insensitive prefix.
                        if ((contentTypeHeader = fetchData.response().getEntity().getContentType()) != null) {
                            contentType = contentTypeHeader.getValue();
                            if (StringUtils.startsWithIgnoreCase((String)contentType, (String)"text")) {
                                this.frontier.contentTypeText.incrementAndGet();
                            } else if (StringUtils.startsWithIgnoreCase((String)contentType, (String)"image")) {
                                this.frontier.contentTypeImage.incrementAndGet();
                            } else if (StringUtils.startsWithIgnoreCase((String)contentType, (String)"application")) {
                                this.frontier.contentTypeApplication.incrementAndGet();
                            } else {
                                this.frontier.contentTypeOthers.incrementAndGet();
                            }
                        }
                        result = "stored";
                    } else {
                        this.frontier.duplicates.incrementAndGet();
                        result = "duplicate";
                    }
                    // Duplicates are passed to the store too, flagged as such by the third argument.
                    this.store.store(fetchData.uri(), fetchData.response(), isNotDuplicate == false, digest, guessedCharset);
                } else {
                    result = "not stored";
                }
                if (ParsingThread.LOGGER.isDebugEnabled()) {
                    // Speed is 1000 * length / (elapsed + 1); the +1 avoids a division by zero.
                    ParsingThread.LOGGER.debug("Fetched " + url + " (" + it.unimi.dsi.Util.formatSize((long)((long)(1000.0 * (double)fetchData.length() / (double)(fetchData.endTime - fetchData.startTime + 1L))), (NumberFormat)this.formatDouble) + "B/s; " + frontierLinkReceiver.scheduledLinks + "/" + frontierLinkReceiver.outlinks + "; " + result + ")");
                }
                var4_6 = fetchData;
                break;
            }
            // Failure path: release the fetch data, then rethrow to the outer handlers.
            catch (Throwable var23_34) {
                var24_35 = fetchData;
                // MONITORENTER : var24_35
                fetchData.inUse = false;
                fetchData.notify();
                // MONITOREXIT : var24_35
                throw var23_34;
            }
            // Normal path: release the fetch data and go back to the top of the main loop.
            {
                // MONITORENTER : var4_6
                fetchData.inUse = false;
                fetchData.notify();
                // MONITOREXIT : var4_6
                ** while (true)
            }
        }
        catch (InterruptedException e) {
            // Interruption ends the thread; it is reported only at debug level.
            if (ParsingThread.LOGGER.isDebugEnabled() == false) return;
            ParsingThread.LOGGER.debug(this + " was interrupted");
            return;
        }
        catch (Throwable t) {
            // Anything else is logged, after which run() returns and the thread ends.
            ParsingThread.LOGGER.error("Unexpected exception", t);
        }
    }

    static {
        EXCEPTION_TO_WAIT_TIME.defaultReturnValue(TimeUnit.HOURS.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(NoRouteToHostException.class, TimeUnit.HOURS.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(SocketException.class, TimeUnit.MINUTES.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(SocketTimeoutException.class, TimeUnit.MINUTES.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(UnknownHostException.class, TimeUnit.HOURS.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(SSLPeerUnverifiedException.class, TimeUnit.HOURS.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(CircularRedirectException.class, 0L);
        EXCEPTION_TO_WAIT_TIME.put(RedirectException.class, 0L);
        EXCEPTION_TO_WAIT_TIME.put(ConnectTimeoutException.class, TimeUnit.HOURS.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(ConnectionClosedException.class, TimeUnit.MINUTES.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(HttpHostConnectException.class, TimeUnit.HOURS.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(NoHttpResponseException.class, TimeUnit.MINUTES.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(TruncatedChunkException.class, TimeUnit.MINUTES.toMillis(1L));
        EXCEPTION_TO_WAIT_TIME.put(MalformedChunkCodingException.class, TimeUnit.MINUTES.toMillis(1L));
        EXCEPTION_TO_MAX_RETRIES.defaultReturnValue(5);
        EXCEPTION_TO_MAX_RETRIES.put(UnknownHostException.class, 2);
        EXCEPTION_TO_MAX_RETRIES.put(SSLPeerUnverifiedException.class, 0);
        EXCEPTION_TO_MAX_RETRIES.put(CircularRedirectException.class, 0);
        EXCEPTION_TO_MAX_RETRIES.put(RedirectException.class, 0);
        EXCEPTION_TO_MAX_RETRIES.put(ConnectTimeoutException.class, 2);
        EXCEPTION_TO_MAX_RETRIES.put(ConnectionClosedException.class, 2);
        EXCEPTION_TO_MAX_RETRIES.put(NoHttpResponseException.class, 2);
        EXCEPTION_TO_MAX_RETRIES.put(TruncatedChunkException.class, 1);
        EXCEPTION_TO_MAX_RETRIES.put(MalformedChunkCodingException.class, 1);
        EXCEPTION_HOST_KILLER.add(NoRouteToHostException.class);
        EXCEPTION_HOST_KILLER.add(UnknownHostException.class);
        EXCEPTION_HOST_KILLER.add(SocketException.class);
        EXCEPTION_HOST_KILLER.add(SSLPeerUnverifiedException.class);
        EXCEPTION_HOST_KILLER.add(ConnectTimeoutException.class);
    }

    protected static final class FrontierEnqueuer {
        private static final boolean ASSERTS = false;
        private final Frontier frontier;
        private final Filter<Link> scheduleFilter;
        private byte[] schemeAuthority;
        private URI uri;
        private char[][] robotsFilter;
        private final ByteArrayList byteList;
        public int outlinks;
        public int scheduledLinks;

        public FrontierEnqueuer(Frontier frontier, RuntimeConfiguration rc) {
            this.frontier = frontier;
            this.scheduleFilter = rc.scheduleFilter;
            this.byteList = new ByteArrayList();
        }

        public void init(URI uri, byte[] schemeAuthority, char[][] robotsFilter) {
            this.outlinks = 0;
            this.scheduledLinks = 0;
            this.uri = uri;
            this.schemeAuthority = schemeAuthority;
            this.robotsFilter = robotsFilter;
        }

        private static boolean sameSchemeAuthority(byte[] schemeAuthority, URI url) {
            String scheme = url.getScheme();
            int schemeLength = scheme.length();
            if (schemeAuthority.length < schemeLength + 3) {
                return false;
            }
            int i = schemeLength;
            while (i-- != 0) {
                if (schemeAuthority[i] == (byte)scheme.charAt(i)) continue;
                return false;
            }
            if (schemeAuthority[schemeLength++] != 58) {
                return false;
            }
            if (schemeAuthority[schemeLength++] != 47) {
                return false;
            }
            if (schemeAuthority[schemeLength++] != 47) {
                return false;
            }
            String authority = url.getRawAuthority();
            if (schemeAuthority.length != schemeLength + authority.length()) {
                return false;
            }
            int i2 = authority.length();
            while (i2-- != 0) {
                if (schemeAuthority[schemeLength + i2] == (byte)authority.charAt(i2)) continue;
                return false;
            }
            return true;
        }

        public void enqueue(URI url) {
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("Analyzing " + url + " for enqueuing");
            }
            ++this.outlinks;
            if (!this.scheduleFilter.apply(new Link(this.uri, url))) {
                if (LOGGER.isDebugEnabled()) {
                    LOGGER.debug("I'm not scheduling URL " + url + ": not accepted by scheduleFilter");
                }
                return;
            }
            Lock lock = this.frontier.rc.blackListedHostHashesLock.readLock();
            lock.lock();
            try {
                if (this.frontier.rc.blackListedHostHashes.contains(url.getHost().hashCode())) {
                    if (LOGGER.isDebugEnabled()) {
                        LOGGER.debug("I'm not scheduling URL " + url + ": host " + url.getHost() + " is blacklisted");
                    }
                    return;
                }
            }
            finally {
                lock.unlock();
            }
            boolean sameSchemeAuthority = FrontierEnqueuer.sameSchemeAuthority(this.schemeAuthority, url);
            assert (Util.toString(this.schemeAuthority).equals(BURL.schemeAndAuthority(url)) == sameSchemeAuthority) : "(" + Util.toString(this.schemeAuthority) + ").equals(" + BURL.schemeAndAuthority(url) + ") != " + sameSchemeAuthority;
            if (this.robotsFilter == null) {
                LOGGER.error("Null robots filter for " + Util.toString(this.schemeAuthority));
            } else if (sameSchemeAuthority && !URLRespectsRobots.apply(this.robotsFilter, url)) {
                if (LOGGER.isDebugEnabled()) {
                    LOGGER.debug("I'm not scheduling URL " + url + ": forbidden by robots");
                }
                return;
            }
            try {
                if (LOGGER.isDebugEnabled()) {
                    LOGGER.debug("I'm scheduling URL " + url);
                }
                BURL.toByteArrayList(url, this.byteList);
                this.frontier.enqueue(this.byteList);
                ++this.scheduledLinks;
            }
            catch (Exception e) {
                LOGGER.error("Exception while enqueuing URL " + url, (Throwable)e);
                throw new RuntimeException(e);
            }
        }
    }
}

