Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.behemoth.tika; import com.digitalpebble.behemoth.BehemothConfiguration; import com.digitalpebble.behemoth.BehemothDocument; import com.digitalpebble.behemoth.BehemothReducer; import org.apache.commons.cli2.CommandLine; import org.apache.commons.cli2.Group; import org.apache.commons.cli2.Option; import org.apache.commons.cli2.OptionException; import org.apache.commons.cli2.builder.ArgumentBuilder; import org.apache.commons.cli2.builder.DefaultOptionBuilder; import org.apache.commons.cli2.builder.GroupBuilder; import org.apache.commons.cli2.commandline.Parser; import org.apache.commons.cli2.util.HelpFormatter; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.List; public class TikaDriver extends Configured implements Tool, TikaConstants { private transient static Logger log = LoggerFactory.getLogger(TikaDriver.class); public TikaDriver() { super(null); } public TikaDriver(Configuration conf) { super(conf); } public static void main(String args[]) throws Exception { int res = ToolRunner.run(BehemothConfiguration.create(), new TikaDriver(), args); System.exit(res); } public int run(String[] args) throws Exception { final FileSystem fs = FileSystem.get(getConf()); GroupBuilder gBuilder = new GroupBuilder().withName("Options:"); List<Option> options = new ArrayList<Option>(); Option inputOpt = buildOption("input", "i", "The input path", true, true, null); options.add(inputOpt); Option outOpt = buildOption("output", "o", "The output path", true, true, null); options.add(outOpt); Option tikaOpt = buildOption("tikaProcessor", "t", "The fully qualified name of a TikaProcessor class that handles the extraction (optional)", true, false, null); options.add(tikaOpt); Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, ""); options.add(mimeTypeOpt); for (Option opt : options) { gBuilder = gBuilder.withOption(opt); } Group group = gBuilder.create(); try { Parser parser = new Parser(); parser.setGroup(group); // TODO catch exceptions with parsing of opts CommandLine cmdLine = parser.parse(args); Path inputPath = new Path(cmdLine.getValue(inputOpt).toString()); Path outputPath = new Path(cmdLine.getValue(outOpt).toString()); String handlerName = null; if (cmdLine.hasOption(tikaOpt)) { handlerName = cmdLine.getValue(tikaOpt).toString(); } JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); if (cmdLine.hasOption(mimeTypeOpt)) { String mimeType = cmdLine.getValue(mimeTypeOpt).toString(); job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType); } if (handlerName != null && handlerName.equals("") == false) { job.set(TIKA_PROCESSOR_KEY, handlerName); } job.setJobName("Tika : " + inputPath.toString()); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(TikaMapper.class); boolean isFilterRequired = BehemothReducer.isRequired(job); if (isFilterRequired) job.setReducerClass(BehemothReducer.class); else { job.setNumReduceTasks(0); } FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("TikaDriver completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { log.error("Exception", e); return -1; // don't delete the output as some of it could be used // fs.delete(outputPath, true); } finally { } } catch (OptionException e) { log.error("OptionException", e.getMessage()); HelpFormatter formatter = new HelpFormatter(); formatter.setGroup(group); formatter.print(); return -1; } return 0; } // taken from Mahout AbstractJob private Option buildOption(String name, String shortName, String description, boolean hasArg, boolean required, String defaultValue) { DefaultOptionBuilder optBuilder = new DefaultOptionBuilder().withLongName(name).withDescription(description) .withRequired(required); if (shortName != null) { optBuilder.withShortName(shortName); } if (hasArg) { ArgumentBuilder argBuilder = new ArgumentBuilder().withName(name).withMinimum(1).withMaximum(1); if (defaultValue != null) { argBuilder = argBuilder.withDefault(defaultValue); } optBuilder.withArgument(argBuilder.create()); } return optBuilder.create(); } }