org.apache.nutch.parse.ParseUtil.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.parse.ParseUtil.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse;

// Commons Logging imports

import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.protocol.Content;

/**
 * A Utility class containing methods to simply perform parsing utilities such
 * as iterating through a preferred list of {@link Parser}s to obtain
 * {@link Parse} objects.
 *
 * @author mattmann
 * @author Jérôme Charron
 * @author Sébastien Le Callonnec
 */
public class ParseUtil {

    /* our log stream */
    public static final Log LOG = LogFactory.getLog(ParseUtil.class);
    private ParserFactory parserFactory;
    /** Parser timeout set to 30 sec by default. Set -1 to deactivate **/
    private int MAX_PARSE_TIME = 30;

    /**
     * 
     * @param conf
     */
    public ParseUtil(Configuration conf) {
        this.parserFactory = new ParserFactory(conf);
        MAX_PARSE_TIME = conf.getInt("parser.timeout", 30);
    }

    /**
     * Performs a parse by iterating through a List of preferred {@link Parser}s
     * until a successful parse is performed and a {@link Parse} object is
     * returned. If the parse is unsuccessful, a message is logged to the
     * <code>WARNING</code> level, and an empty parse is returned.
     *
     * @param content The content to try and parse.
     * @return &lt;key, {@link Parse}&gt; pairs.
     * @throws ParseException If no suitable parser is found to perform the parse.
     */
    public ParseResult parse(Content content) throws ParseException {
        Parser[] parsers = null;

        try {
            parsers = this.parserFactory.getParsers(content.getContentType(),
                    content.getUrl() != null ? content.getUrl() : "");
        } catch (ParserNotFound e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() + " of type "
                        + content.getContentType());
            }
            throw new ParseException(e.getMessage());
        }

        ParseResult parseResult = null;
        for (int i = 0; i < parsers.length; i++) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
            }
            if (MAX_PARSE_TIME != -1)
                parseResult = runParser(parsers[i], content);
            else
                parseResult = parsers[i].getParse(content);

            if (parseResult != null && !parseResult.isEmpty())
                return parseResult;
        }

        if (LOG.isWarnEnabled()) {
            LOG.warn("Unable to successfully parse content " + content.getUrl() + " of type "
                    + content.getContentType());
        }

        return new ParseStatus(new ParseException("Unable to successfully parse content"))
                .getEmptyParseResult(content.getUrl(), null);
    }

    /**
     * Method parses a {@link Content} object using the {@link Parser} specified
     * by the parameter <code>extId</code>, i.e., the Parser's extension ID.
     * If a suitable {@link Parser} is not found, then a <code>WARNING</code>
     * level message is logged, and a ParseException is thrown. If the parse is
     * uncessful for any other reason, then a <code>WARNING</code> level
     * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is
     * returned.
     *
     * @param extId The extension implementation ID of the {@link Parser} to use
     *              to parse the specified content.
     * @param content The content to parse.
     *
     * @return &lt;key, {@link Parse}&gt; pairs if the parse is successful, otherwise,
     *         a single &lt;key, <code>ParseStatus.getEmptyParse()</code>&gt; pair.
     *
     * @throws ParseException If there is no suitable {@link Parser} found
     *                        to perform the parse.
     */
    public ParseResult parseByExtensionId(String extId, Content content) throws ParseException {
        Parser p = null;

        try {
            p = this.parserFactory.getParserById(extId);
        } catch (ParserNotFound e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() + " of type "
                        + content.getContentType());
            }
            throw new ParseException(e.getMessage());
        }

        ParseResult parseResult = null;
        if (MAX_PARSE_TIME != -1)
            parseResult = runParser(p, content);
        else
            parseResult = p.getParse(content);
        if (parseResult != null && !parseResult.isEmpty()) {
            return parseResult;
        } else {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Unable to successfully parse content " + content.getUrl() + " of type "
                        + content.getContentType());
            }
            return new ParseStatus(new ParseException("Unable to successfully parse content"))
                    .getEmptyParseResult(content.getUrl(), null);
        }
    }

    private ParseResult runParser(Parser p, Content content) {
        ParseCallable pc = new ParseCallable(p, content);
        FutureTask<ParseResult> task = new FutureTask<ParseResult>(pc);
        ParseResult res = null;
        Thread t = new Thread(task);
        t.start();
        try {
            res = task.get(MAX_PARSE_TIME, TimeUnit.SECONDS);
        } catch (TimeoutException e) {
            LOG.warn("TIMEOUT parsing " + content.getUrl() + " with " + p);
        } catch (Exception e) {
            task.cancel(true);
            res = null;
            t.interrupt();
        } finally {
            t = null;
            pc = null;
        }
        return res;
    }

}