/*
 * Decompiled with CFR 0.152.
 */
package edu.umd.cloud9.webgraph;

import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.collection.WebDocument;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.CollectionConfigurationManager;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import edu.umd.cloud9.webgraph.normalizer.AnchorTextNormalizer;
import java.io.IOException;
import java.io.UTFDataFormatException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class TrecExtractLinks
extends PowerTool {
    private static final Logger LOG = Logger.getLogger(TrecExtractLinks.class);
    public static final String[] RequiredParameters = new String[]{"Cloud9.InputPath", "Cloud9.OutputPath", "Cloud9.Mappers", "Cloud9.Reducers", "Cloud9.IncludeInternalLinks", "Cloud9.AnchorTextNormalizer", "Cloud9.DocnoMappingClass", "Cloud9.DocnoMappingFile"};
    CollectionConfigurationManager configer;

    /**
     * Lists the configuration keys this tool needs before it can run.
     *
     * @return the shared {@code RequiredParameters} array itself (not a copy)
     */
    @Override
    public String[] getRequiredParameters() {
        final String[] required = TrecExtractLinks.RequiredParameters;
        return required;
    }

    /**
     * Creates the tool from a Hadoop configuration only.
     * <p>
     * This constructor does not assign {@code configer}. {@link #runTool()}
     * uses that field to configure the job, so a tool built this way cannot
     * complete {@code runTool()} unless the field is assigned beforehand.
     *
     * @param conf Hadoop configuration passed to the {@code PowerTool} superclass
     */
    public TrecExtractLinks(Configuration conf) {
        super(conf);
    }

    /**
     * Creates the tool from a Hadoop configuration and a collection
     * configuration manager.
     *
     * @param conf   Hadoop configuration passed to the {@code PowerTool} superclass
     * @param confer manager stored in {@code configer}; {@link #runTool()} calls
     *               its {@code applyJobConfig(Job)} while setting up the job
     */
    public TrecExtractLinks(Configuration conf, CollectionConfigurationManager confer) {
        super(conf);
        this.configer = confer;
    }

    /**
     * Configures and runs the "ExtractLinks" MapReduce job.
     * <p>
     * The docno mapping file is shipped to the tasks through the distributed
     * cache, {@link Map} and {@link Reduce} are installed (the latter also as
     * combiner), the collection-specific settings are applied through
     * {@code configer}, and the output is written as block-compressed
     * sequence files.
     *
     * @return {@code 0} if the job completed successfully, {@code 1} otherwise
     * @throws IllegalStateException if no {@code CollectionConfigurationManager}
     *                               was supplied to this tool
     * @throws RuntimeException      if the docno mapping file does not exist
     * @throws Exception             if the job cannot be set up or submitted
     */
    @Override
    public int runTool() throws Exception {
        // Fail fast with a clear message instead of a bare NullPointerException
        // halfway through job setup (the one-argument constructor leaves this null).
        if (this.configer == null) {
            throw new IllegalStateException(
                    "TrecExtractLinks requires a CollectionConfigurationManager to configure the job");
        }

        Configuration conf = this.getConf();
        // These must be set before the Job is created from the configuration.
        conf.set("mapred.child.java.opts", "-Xmx3072m");
        conf.setInt("mapred.task.timeout", 60000000);
        Job job = new Job(conf);

        int numReducers = conf.getInt("Cloud9.Reducers", 200);
        String inputPath = conf.get("Cloud9.InputPath");
        String outputPath = conf.get("Cloud9.OutputPath");
        String mappingFile = conf.get("Cloud9.DocnoMappingFile");

        FileSystem fs = FileSystem.get(conf);
        if (!fs.exists(new Path(mappingFile))) {
            throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
        }
        DistributedCache.addCacheFile(new Path(mappingFile).toUri(), job.getConfiguration());

        job.setJobName("ExtractLinks");
        job.setNumReduceTasks(numReducers);
        job.setJarByClass(TrecExtractLinks.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);
        this.configer.applyJobConfig(job);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        TrecExtractLinks.recursivelyAddInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        LOG.info("ExtractLinks");
        LOG.info(" - input path: " + inputPath);
        LOG.info(" - output path: " + outputPath);
        LOG.info(" - mapping file: " + mappingFile);
        LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

        // Propagate job failure to the caller instead of always reporting success.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Adds every file found under {@code path} to the job's input paths,
     * descending into sub-directories.
     * <p>
     * Entries whose name starts with {@code "_"} are skipped, whether they
     * are files or directories.
     *
     * @param job  job whose input paths are extended
     * @param path directory to scan, as a string that must parse as a URI
     * @throws IOException      if the directory cannot be listed
     * @throws RuntimeException if {@code path} is not a valid URI; the
     *                          {@link URISyntaxException} is attached as cause
     */
    public static void recursivelyAddInputPaths(Job job, String path) throws IOException {
        FileSystem fs;
        try {
            fs = FileSystem.get(new URI(path), job.getConfiguration());
        } catch (URISyntaxException e) {
            // Keep the cause so the offending character/index is not lost.
            throw new RuntimeException("Error recursively adding path -- " + path, e);
        }

        FileStatus[] entries = fs.listStatus(new Path(path));
        if (entries == null) {
            // Guard against a null listing, which would otherwise surface as a
            // NullPointerException in the loop below.
            throw new IOException("Unable to list input path -- " + path);
        }

        for (FileStatus status : entries) {
            Path entry = status.getPath();
            if (entry.getName().startsWith("_")) {
                continue;
            }
            if (status.isDir()) {
                TrecExtractLinks.recursivelyAddInputPaths(job, entry.toString());
            } else {
                FileInputFormat.addInputPath(job, entry);
            }
        }
    }

    public static class Reduce
    extends Reducer<Text, ArrayListWritable<AnchorText>, Text, ArrayListWritable<AnchorText>> {
        private static final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable();
        private static boolean pushed;

        public void reduce(Text key, Iterable<ArrayListWritable<AnchorText>> values, Reducer.Context context) throws IOException, InterruptedException {
            arrayList.clear();
            for (ArrayListWritable<AnchorText> packet : values) {
                for (AnchorText data : packet) {
                    pushed = false;
                    for (int i = 0; i < arrayList.size(); ++i) {
                        if (!((AnchorText)arrayList.get(i)).equalsIgnoreSources(data)) continue;
                        ((AnchorText)arrayList.get(i)).addDocumentsFrom(data);
                        pushed = true;
                        break;
                    }
                    if (pushed) continue;
                    arrayList.add(data.clone());
                }
            }
            context.write((Object)key, arrayList);
        }
    }

    public static class Map
    extends Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>> {
        private static String base;
        private static String baseHost;
        private static int docno;
        private static final Text keyWord;
        private static final ArrayListWritable<AnchorText> arrayList;
        private static DocnoMapping docnoMapping;
        private static final Parser parser;
        private static final NodeFilter filter;
        private static NodeList list;
        private static boolean includeInternalLinks;
        private static AnchorTextNormalizer normalizer;

        /**
         * Prepares the mapper: instantiates the docno mapping and the
         * anchor-text normalizer named in the configuration, loads the mapping
         * data, and reads the internal-links switch.
         * <p>
         * The mapping data is read from the first file in the distributed
         * cache when one is available, otherwise from the path given by
         * {@code Cloud9.DocnoMappingFile}; in both cases it is opened through
         * the local file system.
         *
         * @param context task context supplying the job configuration
         * @throws IOException      declared for the framework contract
         * @throws RuntimeException if either class cannot be instantiated or
         *                          the mapping cannot be located or loaded;
         *                          the underlying exception is attached as cause
         */
        @Override
        public void setup(Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context)
                throws IOException {
            Configuration conf = context.getConfiguration();

            String docnoMappingClass = conf.get("Cloud9.DocnoMappingClass");
            try {
                docnoMapping = (DocnoMapping) Class.forName(docnoMappingClass).newInstance();
            } catch (Exception e) {
                throw new RuntimeException("Error initializing DocnoMapping class!", e);
            }

            String docnoMappingFile = conf.get("Cloud9.DocnoMappingFile", null);
            if (docnoMappingFile != null) {
                Path docnoMappingPath;
                try {
                    Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
                    // An empty array is treated like "no cache files" instead of
                    // failing with ArrayIndexOutOfBoundsException.
                    docnoMappingPath = (localFiles != null && localFiles.length > 0)
                            ? localFiles[0]
                            : new Path(docnoMappingFile);
                } catch (IOException e) {
                    throw new RuntimeException("Unable to find DocnoMappingFile!", e);
                }
                try {
                    docnoMapping.loadMapping(docnoMappingPath, FileSystem.getLocal(conf));
                } catch (Exception e) {
                    throw new RuntimeException("Error initializing DocnoMapping!", e);
                }
            }

            includeInternalLinks = conf.getBoolean("Cloud9.IncludeInternalLinks", false);

            try {
                normalizer = (AnchorTextNormalizer) Class.forName(conf.get("Cloud9.AnchorTextNormalizer")).newInstance();
            } catch (Exception e) {
                throw new RuntimeException("Error initializing AnchorTextNormalizer", e);
            }
        }

        /**
         * Emits the link records for one document.
         * <p>
         * First a {@code DOCNO_FIELD} record keyed by the document's own
         * normalized URL is written. Then every link found in the document's
         * HTML is normalized and written as an in-link record keyed by the
         * target URL: {@code INTERNAL_IN_LINK} when the target host equals the
         * document's host (only if internal links are enabled), otherwise
         * {@code EXTERNAL_IN_LINK}. Links that cannot be normalized, have no
         * host, or point back to the document itself are skipped.
         * <p>
         * The context parameter is spelled with its full type arguments on
         * purpose: with a raw {@code Mapper.Context} next to the concrete
         * key/value types this method is merely an overload of
         * {@code Mapper.map} and would never be invoked by the framework.
         *
         * @param key     input key (not used)
         * @param doc     document to extract links from
         * @param context sink for output records and counters
         * @throws IOException          if writing a record fails
         * @throws InterruptedException if a write is interrupted
         */
        @Override
        public void map(LongWritable key, WebDocument doc,
                Mapper<LongWritable, WebDocument, Text, ArrayListWritable<AnchorText>>.Context context)
                throws IOException, InterruptedException {
            context.getCounter(LinkCounter.INPUT_DOCS).increment(1L);

            try {
                docno = docnoMapping.getDocno(doc.getDocid());
            } catch (NullPointerException e) {
                // The docno lookup failed for this document id.
                context.getCounter(LinkCounter.INVALID_DOCNO).increment(1L);
                return;
            }

            try {
                // Only the first line of the stored URL is used.
                String url = doc.getURL().split("\n")[0];
                LOG.info("URI: " + url);
                base = Map.normalizeURL(url);
            } catch (Exception e) {
                context.getCounter(LinkCounter.INVALID_URL).increment(1L);
                return;
            }
            if (base == null) {
                context.getCounter(LinkCounter.INVALID_URL).increment(1L);
                return;
            }

            // Record that associates the document's own URL with its docno.
            arrayList.clear();
            arrayList.add(new AnchorText(AnchorTextConstants.Type.DOCNO_FIELD.val, "", docno));
            keyWord.set(base);
            context.write(keyWord, arrayList);
            context.getCounter(LinkCounter.OUTPUT_DOCS).increment(1L);

            try {
                baseHost = new URI(base).getHost();
            } catch (Exception e) {
                context.getCounter(LinkCounter.INVALID_URL).increment(1L);
                return;
            }
            if (baseHost == null) {
                context.getCounter(LinkCounter.INVALID_URL).increment(1L);
                return;
            }

            try {
                // Parse once, append a base-href tag carrying the document URL,
                // then re-parse the result and collect the link tags.
                parser.setInputHTML(doc.getContent());
                NodeList nl = parser.parse(null);
                BaseHrefTag baseTag = new BaseHrefTag();
                baseTag.setBaseUrl(base);
                nl.add(baseTag);
                parser.setInputHTML(nl.toHtml());
                list = parser.extractAllNodesThatMatch(filter);
            } catch (ParserException e) {
                context.getCounter(LinkCounter.PARSER_FAILED).increment(1L);
                return;
            } catch (StackOverflowError e) {
                // Parser overflowed the stack on this document; skip the
                // document rather than fail the task.
                context.getCounter(LinkCounter.PARSER_FAILED).increment(1L);
                return;
            }

            for (int i = 0; i < list.size(); ++i) {
                LinkTag link = (LinkTag) list.elementAt(i);
                String anchor = link.getLinkText();
                String url = Map.normalizeURL(link.extractLink());
                if (url == null || url.equals(base)) {
                    continue;
                }

                String host;
                try {
                    host = new URI(url).getHost();
                } catch (Exception e) {
                    continue;
                }
                if (host == null) {
                    continue;
                }

                if (anchor == null) {
                    anchor = "";
                }
                anchor = normalizer.process(anchor);

                arrayList.clear();
                if (baseHost.equals(host)) {
                    if (!includeInternalLinks) {
                        continue;
                    }
                    arrayList.add(new AnchorText(AnchorTextConstants.Type.INTERNAL_IN_LINK.val, anchor, docno));
                } else {
                    arrayList.add(new AnchorText(AnchorTextConstants.Type.EXTERNAL_IN_LINK.val, anchor, docno));
                }

                keyWord.set(url);
                try {
                    context.write(keyWord, arrayList);
                } catch (UTFDataFormatException e) {
                    // The record could not be serialized; emit the link again
                    // with empty anchor text so the edge itself is kept.
                    context.getCounter(LinkCounter.TEXT_TOO_LONG).increment(1L);
                    byte flag = arrayList.get(0).getType();
                    arrayList.clear();
                    arrayList.add(new AnchorText(flag, "", docno));
                    context.write(keyWord, arrayList);
                }
            }
        }

        /**
         * Reduces a URL to a canonical {@code scheme://host/path} form.
         * <p>
         * The URL is syntactically normalized, scheme and host are lower-cased
         * independently of the JVM's default locale, and trailing slashes are
         * removed from the path. Only scheme, host and path are kept; user
         * info, port, query and fragment are dropped.
         *
         * @param url URL to normalize, may be {@code null}
         * @return the canonical form, or {@code null} if {@code url} is
         *         {@code null}, is not a valid URI, or lacks a scheme or host
         */
        private static String normalizeURL(String url) {
            if (url == null) {
                return null;
            }
            try {
                URI uri = new URI(url).normalize();
                String scheme = uri.getScheme();
                String host = uri.getHost();
                if (scheme == null || host == null) {
                    // Relative or opaque URIs cannot be used as link keys.
                    return null;
                }

                String path = uri.getPath();
                if (path != null) {
                    int end = path.length();
                    while (end > 0 && path.charAt(end - 1) == '/') {
                        end--;
                    }
                    path = path.substring(0, end);
                }

                // Locale.ROOT: under e.g. a Turkish default locale "I" would
                // otherwise lower-case to a dotless i and corrupt the host.
                return new URI(scheme.toLowerCase(Locale.ROOT), host.toLowerCase(Locale.ROOT), path, null).toString();
            } catch (URISyntaxException e) {
                return null;
            }
        }

        // One-time creation of the helpers reused across map() calls:
        // the link-tag filter, the HTML parser, and the output key/value holders.
        static {
            filter = new NodeClassFilter(LinkTag.class);
            parser = new Parser();
            arrayList = new ArrayListWritable<AnchorText>();
            keyWord = new Text();
            // Assigned for real in setup().
            docnoMapping = null;
        }

        /** Job counters maintained by {@code map()}. */
        public static enum LinkCounter {
            /** Incremented once for every call to {@code map()}. */
            INPUT_DOCS,
            /** Incremented after a document's own docno record has been written. */
            OUTPUT_DOCS,
            /** Incremented when the docno lookup for a document fails. */
            INVALID_DOCNO,
            /** Incremented when a document's URL cannot be normalized or has no host. */
            INVALID_URL,
            /** Incremented when a link record had to be re-emitted with empty anchor text. */
            TEXT_TOO_LONG,
            /** Incremented when parsing a document's HTML fails. */
            PARSER_FAILED
        }
    }
}

