// Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.behemoth.util; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import com.digitalpebble.behemoth.BehemothConfiguration; import com.digitalpebble.behemoth.BehemothDocument; import com.digitalpebble.behemoth.DocumentFilter; /** * Utility class used to read the content of a Behemoth SequenceFile. 
**/ public class CorpusReader extends Configured implements Tool { public static void main(String[] args) throws Exception { int res = ToolRunner.run(BehemothConfiguration.create(), new CorpusReader(), args); System.exit(res); } public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("c", "displayContent", false, "display binary content in output"); options.addOption("t", "displayText", false, "display text in output"); options.addOption("a", "displayAnnotations", false, "display annotations in output"); options.addOption("m", "displayMetadata", false, "display metadata in output"); // parse the command line arguments CommandLine line = null; try { line = parser.parse(options, args); String input = line.getOptionValue("i"); if (line.hasOption("help")) { formatter.printHelp("CorpusReader", options); return 0; } if (input == null) { formatter.printHelp("CorpusReader", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusReader", options); return -1; } boolean showBinaryContent = line.hasOption("displayContent"); boolean showText = line.hasOption("displayText"); boolean showAnnotations = line.hasOption("displayAnnotations"); boolean showMD = line.hasOption("displayMetadata"); Path inputPath = new Path(line.getOptionValue("i")); Configuration conf = getConf(); FileSystem fs = inputPath.getFileSystem(conf); // filter input DocumentFilter filters = DocumentFilter.getFilters(conf); boolean doFilter = DocumentFilter.isRequired(conf); FileStatus[] fss = fs.listStatus(inputPath); for (FileStatus status : fss) { Path path = status.getPath(); // skips the _log or _SUCCESS files if (!path.getName().startsWith("part-") && 
!path.getName().equals(inputPath.getName())) continue; SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); Text key = new Text(); BehemothDocument value = new BehemothDocument(); while (reader.next(key, value)) { // skip this document? if (doFilter && filters.keep(value) == false) continue; System.out.println(value.toString(showBinaryContent, showAnnotations, showText, showMD)); } reader.close(); } return 0; } }