/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse.tika;

import de.l3s.boilerpipe.BoilerpipeExtractor;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilters;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.OutlinkExtractor;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.tika.BoilerpipeExtractorRepository;
import org.apache.nutch.parse.tika.DOMBuilder;
import org.apache.nutch.parse.tika.DOMContentUtils;
import org.apache.nutch.parse.tika.HTMLMetaProcessor;
import org.apache.nutch.protocol.Content;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.ContentHandler;

public class TikaParser
implements Parser {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
    private Configuration conf;
    private TikaConfig tikaConfig = null;
    private DOMContentUtils utils;
    private HtmlParseFilters htmlParseFilters;
    private String cachingPolicy;
    private HtmlMapper HTMLMapper;
    private boolean parseEmbedded = true;
    private boolean upperCaseElementNames = true;
    private boolean useBoilerpipe;
    private String boilerpipeExtractorName;
    private String[] boilerpipeMimeTypes;

    public ParseResult getParse(Content content) {
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        DocumentFragment root = doc.createDocumentFragment();
        return this.getParse(content, doc, root);
    }

    /*
     * WARNING - void declaration
     */
    ParseResult getParse(Content content, HTMLDocumentImpl doc, DocumentFragment root) {
        String[] TikaMDNames;
        TeeContentHandler contentHandler;
        URL base;
        String mimeType = content.getContentType();
        try {
            base = new URL(content.getBaseUrl());
        }
        catch (MalformedURLException e) {
            return new ParseStatus((Throwable)e).getEmptyParseResult(content.getUrl(), this.getConf());
        }
        CompositeParser compositeParser = (CompositeParser)this.tikaConfig.getParser();
        org.apache.tika.parser.Parser parser = (org.apache.tika.parser.Parser)compositeParser.getParsers().get(MediaType.parse((String)mimeType));
        if (parser == null) {
            String message = "Can't retrieve Tika parser for mime-type " + mimeType;
            LOG.error(message);
            return new ParseStatus(2, message).getEmptyParseResult(content.getUrl(), this.getConf());
        }
        LOG.debug("Using Tika parser {} for mime-type {}.", (Object)parser.getClass().getName(), (Object)mimeType);
        byte[] raw = content.getContent();
        org.apache.tika.metadata.Metadata tikamd = new org.apache.tika.metadata.Metadata();
        DOMBuilder domBuilder = new DOMBuilder((Document)doc, root);
        domBuilder.setUpperCaseElementNames(this.upperCaseElementNames);
        domBuilder.setDefaultNamespaceURI("http://www.w3.org/1999/xhtml");
        DOMBuilder domHandler = domBuilder;
        LinkContentHandler linkContentHandler = new LinkContentHandler();
        ParseContext context = new ParseContext();
        if (this.parseEmbedded) {
            context.set(org.apache.tika.parser.Parser.class, (Object)new AutoDetectParser(this.tikaConfig));
        }
        if (this.useBoilerpipe && this.boilerpipeExtractorName != null && Arrays.asList(this.boilerpipeMimeTypes).contains(mimeType)) {
            BoilerpipeExtractor extractor = BoilerpipeExtractorRepository.getExtractor(this.boilerpipeExtractorName);
            BoilerpipeContentHandler boilerpipeHandler = new BoilerpipeContentHandler((ContentHandler)domHandler, extractor);
            contentHandler = new TeeContentHandler(new ContentHandler[]{boilerpipeHandler, linkContentHandler});
        } else {
            contentHandler = new TeeContentHandler(new ContentHandler[]{domHandler, linkContentHandler});
        }
        if (this.HTMLMapper != null) {
            context.set(HtmlMapper.class, (Object)this.HTMLMapper);
        }
        tikamd.set("Content-Type", mimeType);
        try {
            parser.parse((InputStream)new ByteArrayInputStream(raw), (ContentHandler)contentHandler, tikamd, context);
        }
        catch (Exception e) {
            LOG.error("Error parsing {}", (Object)content.getUrl(), (Object)e);
            return new ParseStatus(2, e.getMessage()).getEmptyParseResult(content.getUrl(), this.getConf());
        }
        HTMLMetaTags metaTags = new HTMLMetaTags();
        String text = "";
        String title = "";
        Outlink[] outlinks = new Outlink[]{};
        Metadata nutchMetadata = new Metadata();
        HTMLMetaProcessor.getMetaTags(metaTags, root, base);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Meta tags for " + String.valueOf(base) + ": " + metaTags.toString());
        }
        if (!metaTags.getNoIndex()) {
            StringBuffer sb = new StringBuffer();
            if (LOG.isTraceEnabled()) {
                LOG.trace("Getting text...");
            }
            this.utils.getText(sb, root);
            text = sb.toString();
            sb.setLength(0);
            if (LOG.isTraceEnabled()) {
                LOG.trace("Getting title...");
            }
            this.utils.getTitle(sb, root);
            title = sb.toString().trim();
        }
        if (!metaTags.getNoFollow()) {
            void var21_26;
            ArrayList<Outlink> l = new ArrayList<Outlink>();
            URL uRL = base;
            String baseTagHref = tikamd.get("Content-Location");
            if (baseTagHref != null) {
                try {
                    URL uRL2 = new URL(base, baseTagHref);
                }
                catch (MalformedURLException e) {
                    LOG.trace("Invalid <base href=\"{}\">", (Object)baseTagHref);
                }
            }
            if (LOG.isTraceEnabled()) {
                LOG.trace("Getting links (base URL = {}) ...", (Object)var21_26);
            }
            List tikaExtractedOutlinks = linkContentHandler.getLinks();
            this.utils.getOutlinks((URL)var21_26, l, tikaExtractedOutlinks);
            outlinks = l.toArray(new Outlink[l.size()]);
            if (LOG.isTraceEnabled()) {
                LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
            }
        }
        for (String tikaMDName : TikaMDNames = tikamd.names()) {
            String[] values;
            if (tikaMDName.equalsIgnoreCase(TikaCoreProperties.TITLE.toString())) continue;
            for (String v : values = tikamd.getValues(tikaMDName)) {
                nutchMetadata.add(tikaMDName, v);
                if (!tikaMDName.equalsIgnoreCase("robots") || nutchMetadata.get("robots") != null) continue;
                nutchMetadata.add("robots", v);
            }
        }
        if (outlinks.length == 0) {
            outlinks = OutlinkExtractor.getOutlinks((String)text, (Configuration)this.getConf());
        }
        ParseStatus parseStatus = new ParseStatus(1);
        if (metaTags.getRefresh()) {
            parseStatus.setMinorCode((short)100);
            parseStatus.setArgs(new String[]{metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime())});
        }
        ParseData parseData = new ParseData(parseStatus, title, outlinks, content.getMetadata(), nutchMetadata);
        ParseResult parseResult = ParseResult.createParseResult((String)content.getUrl(), (Parse)new ParseImpl(text, parseData));
        ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
        if (metaTags.getNoCache()) {
            for (Map.Entry entry : filteredParse) {
                ((Parse)entry.getValue()).getData().getParseMeta().set("caching.forbidden", this.cachingPolicy);
            }
        }
        return filteredParse;
    }

    public void setConf(Configuration conf) {
        String htmlmapperClassName;
        Object message;
        this.conf = conf;
        this.tikaConfig = null;
        String customConfFile = conf.get("tika.config.file");
        if (customConfFile != null) {
            try {
                URL customTikaConfig = conf.getResource(customConfFile);
                if (customTikaConfig != null) {
                    this.tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader());
                }
            }
            catch (Exception e1) {
                message = "Problem loading custom Tika configuration from " + customConfFile;
                LOG.error((String)message, (Throwable)e1);
            }
        }
        if (this.tikaConfig == null) {
            try {
                this.tikaConfig = new TikaConfig(this.getClass().getClassLoader());
            }
            catch (Exception e2) {
                message = "Problem loading default Tika configuration";
                LOG.error((String)message, (Throwable)e2);
            }
        }
        if (StringUtils.isNotBlank((CharSequence)(htmlmapperClassName = conf.get("tika.htmlmapper.classname")))) {
            try {
                Class<?> HTMLMapperClass = Class.forName(htmlmapperClassName);
                boolean interfaceOK = HtmlMapper.class.isAssignableFrom(HTMLMapperClass);
                if (!interfaceOK) {
                    throw new RuntimeException("Class " + htmlmapperClassName + " does not implement HtmlMapper");
                }
                this.HTMLMapper = (HtmlMapper)HTMLMapperClass.getConstructor(new Class[0]).newInstance(new Object[0]);
            }
            catch (Exception e) {
                String message2 = "Can't generate instance for class " + htmlmapperClassName;
                LOG.error(message2);
                throw new RuntimeException(message2);
            }
        }
        this.htmlParseFilters = new HtmlParseFilters(conf);
        this.utils = new DOMContentUtils(conf);
        this.cachingPolicy = conf.get("parser.caching.forbidden.policy", "content");
        this.upperCaseElementNames = conf.getBoolean("tika.uppercase.element.names", true);
        this.parseEmbedded = conf.getBoolean("tika.parse.embedded", true);
        this.useBoilerpipe = conf.get("tika.extractor", "none").equals("boilerpipe");
        this.boilerpipeExtractorName = conf.get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
        this.boilerpipeMimeTypes = conf.getStrings("tika.extractor.boilerpipe.mime.types", new String[]{"text/html", "application/xhtml+xml"});
    }

    public Configuration getConf() {
        return this.conf;
    }
}

