/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.parser.html;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.HTML;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.DataURIScheme;
import org.apache.tika.parser.html.DataURISchemeParseException;
import org.apache.tika.parser.html.DataURISchemeUtil;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * SAX handler that turns the element events of an HTML document into safe
 * XHTML output and Tika metadata.
 *
 * <p>Element names are compared against upper-case literals such as
 * {@code "BODY"} and {@code "SCRIPT"}, so the upstream parser is expected to
 * report element names in upper case. Four nesting counters track whether the
 * current event lies inside the body (or a frameset), inside an element the
 * {@link HtmlMapper} discards, inside a {@code TITLE}, or inside a
 * {@code SCRIPT}.
 *
 * <p>Instances keep per-document state in plain fields and are therefore
 * meant to be used for a single parse.
 */
class HtmlHandler extends TextContentHandler {
    /** Metadata key holding the base location used to resolve relative URLs. */
    private static final String CONTENT_LOCATION = "Content-Location";

    /** Normalized attribute names whose values are resolved against the base location. */
    private static final Set<String> URI_ATTRIBUTES = new HashSet<>(Arrays.asList("src", "href", "longdesc", "cite"));

    /** Matches an ICBM value made of two decimal numbers separated by commas and/or whitespace. */
    private static final Pattern ICBM = Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");

    /** Lower-cased {@code <meta>} names that are additionally stored under a typed property. */
    private static final Map<String, Property> META_HEADER_MAPPINGS = new HashMap<>();

    /** Shared empty attribute list used whenever no script attributes are pending. */
    private static final Attributes EMPTY_ATTS = new AttributesImpl();

    static {
        META_HEADER_MAPPINGS.put("author", TikaCoreProperties.CREATOR);
        META_HEADER_MAPPINGS.put("title", TikaCoreProperties.TITLE);
        META_HEADER_MAPPINGS.put("subject", TikaCoreProperties.SUBJECT);
        META_HEADER_MAPPINGS.put("keywords", Office.KEYWORDS);
        META_HEADER_MAPPINGS.put("description", TikaCoreProperties.DESCRIPTION);
    }

    private final HtmlMapper mapper;
    private final XHTMLContentHandler xhtml;
    private final Metadata metadata;
    private final ParseContext context;
    private final boolean extractScripts;
    /** Text collected for the document title; see {@link #startElement} for when it is reset. */
    private final StringBuilder title = new StringBuilder();
    private final DataURISchemeUtil dataURISchemeUtil = new DataURISchemeUtil();
    /** Script text collected while inside {@code SCRIPT} and script extraction is enabled. */
    private final StringBuilder script = new StringBuilder();
    private int bodyLevel = 0;
    private int discardLevel = 0;
    private int titleLevel = 0;
    private int scriptLevel = 0;
    /** Snapshot of the attributes of a {@code SCRIPT} seen outside the body, or {@link #EMPTY_ATTS}. */
    private Attributes scriptAtts = EMPTY_ATTS;
    /** Set once the {@code TITLE} element text has been written to the metadata. */
    private boolean isTitleSetToMetadata = false;

    /**
     * Shared constructor. When the metadata has no {@code Content-Location} but
     * carries a {@code resourceName} that parses as a URL, that name becomes
     * the content location.
     */
    private HtmlHandler(HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean extractScripts) {
        super(xhtml);
        this.mapper = mapper;
        this.xhtml = xhtml;
        this.metadata = metadata;
        this.context = context;
        this.extractScripts = extractScripts;

        if (metadata.get(CONTENT_LOCATION) != null) {
            return;
        }
        String name = metadata.get("resourceName");
        if (name == null) {
            return;
        }
        name = name.trim();
        try {
            // Constructed only to validate the syntax; the URL object itself is not needed.
            new URL(name);
            metadata.set(CONTENT_LOCATION, name);
        } catch (MalformedURLException ignored) {
            // The resource name is not a URL, so it cannot serve as a base location.
        }
    }

    /**
     * Creates a handler that writes XHTML to {@code handler}.
     *
     * @param mapper         decides which elements and attributes are kept, renamed or discarded
     * @param handler        receiver of the generated XHTML events
     * @param metadata       metadata that is read (base location) and updated during parsing
     * @param context        parse context used to look up the embedded document extractor
     * @param extractScripts whether script content is collected and handed to the embedded
     *                       document extractor
     */
    public HtmlHandler(HtmlMapper mapper, ContentHandler handler, Metadata metadata, ParseContext context, boolean extractScripts) {
        this(mapper, new XHTMLContentHandler(handler, metadata), metadata, context, extractScripts);
    }

    /**
     * Creates a handler with a fresh {@link ParseContext} and script extraction disabled.
     *
     * @deprecated use
     *     {@link #HtmlHandler(HtmlMapper, ContentHandler, Metadata, ParseContext, boolean)}
     */
    @Deprecated
    public HtmlHandler(HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
        this(mapper, new XHTMLContentHandler(handler, metadata), metadata, new ParseContext(), false);
    }

    /**
     * Updates the nesting counters, records head-level metadata, forwards
     * safe body elements and extracts {@code data:} URIs found in {@code src}.
     */
    @Override
    public void startElement(String uri, String local, String name, Attributes atts) throws SAXException {
        if ("HTML".equals(name) && atts.getValue("lang") != null) {
            metadata.set("Content-Language", atts.getValue("lang"));
        }
        if ("SCRIPT".equals(name)) {
            scriptLevel++;
        }
        if ("TITLE".equals(name) || titleLevel > 0) {
            titleLevel++;
        }
        if ("BODY".equals(name) || "FRAMESET".equals(name) || bodyLevel > 0) {
            bodyLevel++;
        }
        if (mapper.isDiscardElement(name) || discardLevel > 0) {
            discardLevel++;
        }

        if (bodyLevel == 0 && discardLevel == 0) {
            handleHeadElement(name, atts);
        }
        if (bodyLevel > 0 && discardLevel == 0) {
            String safe = mapper.mapSafeElement(name);
            if (safe != null) {
                startElementWithSafeAttributes(safe, atts);
            }
        }

        // Every start tag drops the title text gathered so far, so only the
        // characters that follow the most recent start tag end up in the title.
        title.setLength(0);

        String src = atts.getValue("src");
        // Inside a script, data URIs are only extracted when script extraction is enabled.
        if (src != null && src.startsWith("data:") && (scriptLevel == 0 || extractScripts)) {
            handleDataURIScheme(src);
        }
    }

    /**
     * Handles an element that starts outside the body and outside any
     * discarded element: {@code META}, {@code BASE}, {@code LINK} and
     * {@code SCRIPT}.
     */
    private void handleHeadElement(String name, Attributes atts) throws SAXException {
        if ("META".equals(name) && atts.getValue("content") != null) {
            String content = atts.getValue("content");
            if (atts.getValue("http-equiv") != null) {
                addHtmlMetadata(atts.getValue("http-equiv"), content);
            } else if (atts.getValue("name") != null) {
                addHtmlMetadata(atts.getValue("name"), content);
            } else if (atts.getValue("property") != null) {
                metadata.add(atts.getValue("property"), content);
            }
        } else if ("BASE".equals(name) && atts.getValue("href") != null) {
            startElementWithSafeAttributes("base", atts);
            xhtml.endElement("base");
            metadata.set(CONTENT_LOCATION, resolve(atts.getValue("href")));
        } else if ("LINK".equals(name)) {
            startElementWithSafeAttributes("link", atts);
            xhtml.endElement("link");
        } else if ("SCRIPT".equals(name)) {
            // The attributes are read again in endElement, after this callback
            // has returned. A SAX parser may reuse the object it passed in, so
            // keep a private snapshot instead of the parser-owned instance.
            scriptAtts = new AttributesImpl(atts);
        }
    }

    /**
     * Stores one {@code <meta>} name/content pair in the metadata. Blank names
     * or values are ignored. {@code ICBM} and {@code Content-Type} get special
     * treatment; all other names are added under their raw name and, when
     * listed in {@link #META_HEADER_MAPPINGS}, under the mapped property too.
     */
    private void addHtmlMetadata(String name, String value) {
        if (StringUtils.isBlank(name) || StringUtils.isBlank(value)) {
            return;
        }
        if (name.equalsIgnoreCase("ICBM")) {
            Matcher coordinates = ICBM.matcher(value);
            if (coordinates.matches()) {
                metadata.set("ICBM", coordinates.group(1) + ", " + coordinates.group(2));
                metadata.set(Metadata.LATITUDE, coordinates.group(1));
                metadata.set(Metadata.LONGITUDE, coordinates.group(2));
            } else {
                metadata.set("ICBM", value);
            }
            return;
        }
        if (name.equalsIgnoreCase("Content-Type")) {
            MediaType type = MediaType.parse(value);
            // Fall back to the raw value when it cannot be parsed as a media type.
            metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type != null ? type.toString() : value);
            return;
        }

        Property property = META_HEADER_MAPPINGS.get(name.toLowerCase(Locale.US));
        if (property != null) {
            // A title taken from the TITLE element wins over a <meta name="title">.
            boolean titleAlreadySet = property.equals(TikaCoreProperties.TITLE) && isTitleSetToMetadata;
            if (!titleAlreadySet) {
                if (property.isMultiValuePermitted()) {
                    metadata.add(property, value);
                } else {
                    metadata.set(property, value);
                }
            }
        }
        metadata.add(name, value);
    }

    /**
     * Starts the XHTML element {@code name} with only those attributes the
     * mapper accepts. URI attributes are resolved against the base location,
     * {@code data:} URIs in {@code src} are cut down to the bare scheme, and
     * {@code object} attributes are resolved against the object's codebase.
     * An {@code img} without {@code alt} receives an empty one.
     */
    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
        if (atts.getLength() == 0) {
            xhtml.startElement(name);
            return;
        }

        boolean isObject = name.equals("object");
        String codebase = null;
        if (isObject) {
            String declared = atts.getValue("", "codebase");
            codebase = declared != null ? resolve(declared) : metadata.get(CONTENT_LOCATION);
        }

        AttributesImpl safeAtts = new AttributesImpl(atts);
        for (int i = 0; i < safeAtts.getLength(); i++) {
            String mapped = mapper.mapSafeAttribute(name, safeAtts.getLocalName(i));
            if (mapped == null) {
                // Removal shifts the following attributes down, so revisit this index.
                safeAtts.removeAttribute(i);
                i--;
                continue;
            }
            safeAtts.setLocalName(i, mapped);
            if (URI_ATTRIBUTES.contains(mapped)) {
                if ("src".equals(mapped) && safeAtts.getValue(i).startsWith("data:")) {
                    // The payload is extracted separately; do not copy it into the output.
                    safeAtts.setValue(i, "data:");
                }
                safeAtts.setValue(i, resolve(safeAtts.getValue(i)));
            } else if (isObject && "codebase".equals(mapped)) {
                safeAtts.setValue(i, codebase);
            } else if (isObject && ("data".equals(mapped) || "classid".equals(mapped))) {
                safeAtts.setValue(i, resolve(codebase, safeAtts.getValue(i)));
            }
        }

        if ("img".equals(name) && safeAtts.getValue("", "alt") == null) {
            safeAtts.addAttribute("", "alt", "alt", "CDATA", "");
        }
        xhtml.startElement(name, safeAtts);
    }

    /**
     * Closes the element: emits a pending script element and its content,
     * ends the mapped XHTML element (or writes a newline for end-of-line
     * elements that have no safe mapping), stores the title once its element
     * closes, and unwinds the nesting counters.
     */
    @Override
    public void endElement(String uri, String local, String name) throws SAXException {
        // Guarded like the other counters so an unmatched end tag cannot push
        // the level below zero and suppress every later script.
        if ("SCRIPT".equals(name) && scriptLevel > 0) {
            scriptLevel--;
            if (scriptLevel == 0) {
                try {
                    if (scriptAtts.getLength() > 0) {
                        startElementWithSafeAttributes("script", scriptAtts);
                        xhtml.endElement("script");
                    }
                    if (extractScripts) {
                        // Must run before the attributes are cleared: it reads "src" from them.
                        writeScript();
                    }
                } finally {
                    scriptAtts = EMPTY_ATTS;
                }
            }
        }

        if (bodyLevel > 0 && discardLevel == 0) {
            String safe = mapper.mapSafeElement(name);
            if (safe != null) {
                xhtml.endElement(safe);
            } else if (XHTMLContentHandler.ENDLINE.contains(name.toLowerCase(Locale.ENGLISH))) {
                xhtml.newline();
            }
        }

        if (titleLevel > 0) {
            titleLevel--;
            if (titleLevel == 0 && !isTitleSetToMetadata) {
                metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
                isTitleSetToMetadata = true;
            }
        }
        if (bodyLevel > 0) {
            bodyLevel--;
        }
        if (discardLevel > 0) {
            discardLevel--;
        }
    }

    /**
     * Parses a {@code data:} URI and passes its payload to the embedded
     * document extractor. A URI that cannot be parsed is skipped.
     */
    private void handleDataURIScheme(String string) throws SAXException {
        DataURIScheme dataURIScheme;
        try {
            dataURIScheme = dataURISchemeUtil.parse(string);
        } catch (DataURISchemeParseException ignored) {
            // Not a usable data URI; there is nothing to extract.
            return;
        }
        parseInlineDataUri(dataURIScheme, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
    }

    /**
     * Hands one parsed data URI to {@code extractor} as an inline embedded
     * resource, provided the extractor wants it. The content type is set only
     * when the data URI declares one. I/O failures are recorded on the parent
     * metadata instead of aborting the parse.
     */
    private void parseInlineDataUri(DataURIScheme dataURIScheme, EmbeddedDocumentExtractor extractor) throws SAXException {
        Metadata inlineMetadata = new Metadata();
        inlineMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
        if (dataURIScheme.getMediaType() != null) {
            inlineMetadata.set("Content-Type", dataURIScheme.getMediaType().toString());
        }
        if (!extractor.shouldParseEmbedded(inlineMetadata)) {
            return;
        }
        try (InputStream stream = dataURIScheme.getInputStream()) {
            extractor.parseEmbedded(stream, xhtml, inlineMetadata, true);
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
    }

    /**
     * Passes the collected script text to the embedded document extractor as
     * a macro resource, after first extracting any data URIs found in the
     * text. Does nothing when the collected text is blank. The script buffer
     * is cleared once the macro has been handed over.
     */
    private void writeScript() throws SAXException {
        String scriptText = script.toString();
        if (scriptText.trim().length() == 0) {
            return;
        }

        Metadata macroMetadata = new Metadata();
        macroMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
        String src = scriptAtts.getValue("src");
        if (src != null) {
            macroMetadata.set(HTML.SCRIPT_SOURCE, src);
        }

        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        List<DataURIScheme> dataURISchemes = dataURISchemeUtil.extract(scriptText);
        for (DataURIScheme dataURIScheme : dataURISchemes) {
            parseInlineDataUri(dataURIScheme, extractor);
        }

        try (UnsynchronizedByteArrayInputStream stream = new UnsynchronizedByteArrayInputStream(scriptText.getBytes(StandardCharsets.UTF_8))) {
            extractor.parseEmbedded(stream, xhtml, macroMetadata, true);
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        } finally {
            script.setLength(0);
        }
    }

    /**
     * Routes character data to the script buffer, the title buffer and/or the
     * XHTML output, depending on the current nesting state.
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (scriptLevel > 0 && extractScripts) {
            script.append(ch, start, length);
        }
        if (titleLevel > 0 && bodyLevel == 0) {
            title.append(ch, start, length);
        }
        if (bodyLevel > 0 && discardLevel == 0) {
            super.characters(ch, start, length);
        }
    }

    /** Forwards ignorable whitespace only for non-discarded body content. */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        if (bodyLevel > 0 && discardLevel == 0) {
            super.ignorableWhitespace(ch, start, length);
        }
    }

    /** Resolves {@code url} against the {@code Content-Location} stored in the metadata. */
    private String resolve(String url) {
        return resolve(metadata.get(CONTENT_LOCATION), url);
    }

    /**
     * Resolves {@code url} against {@code base}.
     *
     * @param base base location, may be {@code null}
     * @param url  reference to resolve; must not be {@code null}
     * @return the trimmed {@code url} unchanged when there is no base, when
     *     the url uses one of the schemes {@code urn}, {@code mailto},
     *     {@code tel}, {@code data}, {@code javascript} or {@code about}, or
     *     when resolution fails; otherwise the resolved absolute URL
     */
    private String resolve(String base, String url) {
        url = url.trim();
        String lower = url.toLowerCase(Locale.ENGLISH);
        boolean opaqueScheme = lower.startsWith("urn:")
                || lower.startsWith("mailto:")
                || lower.startsWith("tel:")
                || lower.startsWith("data:")
                || lower.startsWith("javascript:")
                || lower.startsWith("about:");
        if (base == null || opaqueScheme) {
            return url;
        }
        try {
            URL baseURL = new URL(base.trim());
            String path = baseURL.getPath();
            if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
                // A bare query string is appended to the full base path here,
                // keeping the last path segment of the base.
                return new URL(baseURL.getProtocol(), baseURL.getHost(), baseURL.getPort(), path + url).toExternalForm();
            }
            return new URL(baseURL, url).toExternalForm();
        } catch (MalformedURLException e) {
            // Unresolvable: hand back the reference as written.
            return url;
        }
    }
}

