com.digitalpebble.behemoth.tika.TikaMapper.java Source code

Introduction

Here is the source code for com.digitalpebble.behemoth.tika.TikaMapper.java, a Hadoop Mapper that runs Apache Tika text extraction over Behemoth documents.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.digitalpebble.behemoth.tika;

import com.digitalpebble.behemoth.BehemothDocument;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

/**
 * Uses a {@link com.digitalpebble.behemoth.tika.TikaProcessor} to extract text
 * using Tika. Users wanting to override the default behaviour of the
 * TikaProcessor can set the "tika.processor" key in the JobConf to a fully
 * qualified class name. The implementation must extend TikaProcessor and must
 * have a zero-argument constructor.
 */
public class TikaMapper extends MapReduceBase implements Mapper<Text, BehemothDocument, Text, BehemothDocument> {
    private static final Logger LOG = LoggerFactory.getLogger(TikaMapper.class);

    protected TikaProcessor processor;

    @Override
    public void map(Text text, BehemothDocument inputDoc, OutputCollector<Text, BehemothDocument> outputCollector,
            Reporter reporter) throws IOException {

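        // delegate the Tika parsing to the configured processor; it may
        // return several documents for a single input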
        BehemothDocument[] documents = processor.process(inputDoc, reporter);
        if (documents != null) {
            for (int i = 0; i < documents.length; i++) {
                try {
                    outputCollector.collect(text, documents[i]);
                } catch (Exception e) {
                    // log and skip the offending document rather than failing the whole task
                    LOG.error("Error writing doc {}", inputDoc.getUrl(), e);
                }
            }
        }
    }

    @Override
    public void configure(JobConf job) {

        String handlerName = job.get(TikaConstants.TIKA_PROCESSOR_KEY);
        if (handlerName != null) {
            // resolve the class named by the "tika.processor" key, falling back
            // to the default TikaProcessor if it cannot be loaded
            Class<?> handlerClass = job.getClass(TikaConstants.TIKA_PROCESSOR_KEY, TikaProcessor.class);
            try {
                processor = (TikaProcessor) handlerClass.newInstance();
            } catch (InstantiationException e) {
                LOG.error("Exception", e);
                // TODO: what's the best way to do this?
                throw new RuntimeException(e);
            } catch (IllegalAccessException e) {
                LOG.error("Exception", e);
                throw new RuntimeException(e);
            }
        } else {
            processor = new TikaProcessor();
        }
        processor.setConf(job);
    }
}
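
The class javadoc above points to a simple extension pattern: subclass TikaProcessor, keep a zero-argument constructor, and set the "tika.processor" key to the subclass name. The sketch below illustrates that wiring. The MyTikaProcessor and MyDriver names, the com.example package and the filtering logic are hypothetical; the process(BehemothDocument, Reporter) signature and the configuration key are taken from the source above, and getText() is assumed to be part of BehemothDocument's API.

package com.example;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.mapred.Reporter;

import com.digitalpebble.behemoth.BehemothDocument;
import com.digitalpebble.behemoth.tika.TikaProcessor;

/** Hypothetical processor that drops documents where Tika extracted no text. */
public class MyTikaProcessor extends TikaProcessor {

    public MyTikaProcessor() {
        // zero-argument constructor required by TikaMapper.configure()
    }

    @Override
    public BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
        // run the default Tika extraction first, then filter the output
        BehemothDocument[] docs = super.process(inputDoc, reporter);
        if (docs == null)
            return null;
        List<BehemothDocument> kept = new ArrayList<BehemothDocument>();
        for (BehemothDocument doc : docs) {
            // getText() assumed to return the text extracted by Tika
            if (doc.getText() != null && doc.getText().trim().length() > 0)
                kept.add(doc);
        }
        return kept.toArray(new BehemothDocument[kept.size()]);
    }
}

In the driver, registering the mapper and setting the key is enough for configure() to pick the custom class up:

    JobConf job = new JobConf(MyDriver.class);
    job.setMapperClass(TikaMapper.class);
    // fully qualified class name, as required by the javadoc
    job.set("tika.processor", "com.example.MyTikaProcessor");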