/*
 * Decompiled with CFR 0.152.
 */
package org.archive.modules.extractor;

import java.io.InputStream;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReplayInputStream;
import org.archive.io.SeekInputStream;
import org.archive.io.SeekReader;
import org.archive.io.SeekReaderCharSequence;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.ContentExtractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ms.Doc;

/**
 * Content extractor that scans the text of a Microsoft Word document for
 * {@code HYPERLINK} fields and reports each quoted target as an outlink of
 * the {@link CrawlURI} being processed.
 */
public class ExtractorDOC extends ContentExtractor {
    private static final long serialVersionUID = 3L;

    /** MIME type prefix (compared case-insensitively) this extractor accepts. */
    private static final String MSWORD_MIME_PREFIX = "application/msword";

    /**
     * Matches the literal {@code HYPERLINK}, then lazily skips to the first
     * double-quoted string; group 1 is the text between the quotes.
     */
    private static final Pattern PATTERN = Pattern.compile("HYPERLINK.*?\"(.*?)\"");

    private static final Logger logger =
            Logger.getLogger("org.archive.crawler.extractor.ExtractorDOC");

    /**
     * Decides whether this extractor applies to the given URI.
     *
     * @param uri the URI whose content type is inspected
     * @return {@code true} when the content type is non-null and starts with
     *         {@code application/msword}, ignoring case
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        String mimeType = uri.getContentType();
        if (mimeType == null) {
            return false;
        }
        // regionMatches(ignoreCase=true) compares per character and does not
        // consult the default locale, unlike toLowerCase() (e.g. under a
        // Turkish locale 'I' lowercases to a dotless 'ı' and the prefix test
        // would fail).
        return mimeType.regionMatches(
                true, 0, MSWORD_MIME_PREFIX, 0, MSWORD_MIME_PREFIX.length());
    }

    /**
     * Reads the recorded content of {@code curi} through
     * {@link Doc#getText}, searches the resulting character sequence with
     * {@link #PATTERN}, and passes every captured target to
     * {@link #addLink(CrawlURI, String)}.
     *
     * <p>Any exception raised while opening the document is appended to
     * {@code curi.getNonFatalFailures()} and the method returns
     * {@code false}. The content stream is always closed, and the
     * {@link ReplayInputStream} created here is always destroyed, including
     * when opening or scanning fails.
     *
     * @param curi the URI whose recorded content is scanned
     * @return {@code true} if the document was opened and scanned;
     *         {@code false} if no content stream was available or the
     *         document could not be opened
     */
    @Override
    protected boolean innerExtract(CrawlURI curi) {
        InputStream contentStream = null;
        ReplayInputStream documentStream = null;
        try {
            SeekReader docReader = null;
            try {
                contentStream = curi.getRecorder().getContentReplayInputStream();
                if (contentStream == null) {
                    return false;
                }
                documentStream = new ReplayInputStream(contentStream);
                docReader = Doc.getText(documentStream);
            } catch (Exception e) {
                // Recorded as non-fatal: a document that cannot be opened
                // simply yields no links.
                curi.getNonFatalFailures().add(e);
                return false;
            } finally {
                IOUtils.closeQuietly(contentStream);
            }

            int links = 0;
            CharSequence text = new SeekReaderCharSequence(docReader, 0);
            Matcher matcher = PATTERN.matcher(text);
            while (matcher.find()) {
                ++links;
                addLink(curi, matcher.group(1));
            }
            logger.fine(curi + " has " + links + " links.");
            return true;
        } finally {
            // Previously destroy() was reached only on the success path, so a
            // failure in Doc.getText or during matching left the stream
            // undestroyed.
            if (documentStream != null) {
                documentStream.destroy();
            }
        }
    }

    /**
     * Registers one hyperlink target found in the document as an outlink.
     *
     * <p>The target is first resolved against the URI of {@code curi}; if
     * that raises a {@link URIException} the error is logged through
     * {@code logUriError} and no outlink is added. The extracted-links
     * counter is incremented in either case, i.e. it counts every candidate
     * found, not only those that resolved.
     *
     * @param curi      the URI the link was found in
     * @param hyperlink the raw link text captured from the document
     */
    private void addLink(CrawlURI curi, String hyperlink) {
        try {
            // Result intentionally unused: the call is made only so that an
            // unparsable target raises URIException before addOutlink runs.
            UURIFactory.getInstance(curi.getUURI(), hyperlink);
            addOutlink(curi, hyperlink, LinkContext.NAVLINK_MISC, Hop.NAVLINK);
        } catch (URIException e) {
            logUriError(e, curi.getUURI(), hyperlink);
        }
        numberOfLinksExtracted.incrementAndGet();
    }
}

