Java tutorial: building a postings forward index with Hadoop MapReduce
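The class below (adapted from the Ivory toolkit for a twitter corpus) is a Hadoop MapReduce job that builds a forward index over a set of postings SequenceFiles. Each map task records, for every term, the number of the part file it was read from and the byte offset at which that term's postings list starts; a single reducer then packs the two values into one long and writes the resulting (term, position) records to a flat file on HDFS.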
/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package sa.edu.kaust.twitter.index;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import sa.edu.kaust.twitter.index.data.TweetPostingsList;

@SuppressWarnings("deprecation")
public class BuildPostingsForwardIndex extends Configured implements Tool {
  private static final Logger sLogger = Logger.getLogger(BuildPostingsForwardIndex.class);

  // Counter group tracking the number of terms processed.
  protected static enum Dictionary { Size };

  private static class MyMapRunner implements MapRunnable<Text, TweetPostingsList, Text, Text> {
    private String mInputFile;
    private Text outputValue = new Text();

    public void configure(JobConf job) {
      mInputFile = job.get("map.input.file"); // filename of this map task's split
    }

    public void run(RecordReader<Text, TweetPostingsList> input,
        OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
      Text key = input.createKey();
      TweetPostingsList value = input.createValue();

      // Derive the file number from the trailing digits of the split's
      // filename (e.g., "part-00017" -> 17).
      int fileNo = Integer.parseInt(mInputFile.substring(mInputFile.lastIndexOf("-") + 1));

      long pos = input.getPos(); // current byte position in the file
      while (input.next(key, value)) {
        // For each term, emit the file number and the byte offset at which
        // its postings list starts.
        outputValue.set(fileNo + "\t" + pos);
        output.collect(key, outputValue);
        reporter.incrCounter(Dictionary.Size, 1);
        pos = input.getPos();
      }
      sLogger.info("last termid: " + key + " (" + fileNo + ", " + pos + ")");
    }
  }

  public static final long BigNumber = 1000000000;

  private static class MyReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
    FSDataOutputStream mOut;
    int mCurDoc = 0; // number of (term, position) records written

    public void configure(JobConf job) {
      FileSystem fs;
      try {
        fs = FileSystem.get(job);
      } catch (Exception e) {
        throw new RuntimeException("Error opening the FileSystem!");
      }

      String forwardIndexPath = job.get("ForwardIndexPath");
      try {
        mOut = fs.create(new Path(forwardIndexPath), true);
      } catch (Exception e) {
        throw new RuntimeException("Error in creating files!");
      }
    }

    public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output,
        Reporter reporter) throws IOException {
      String[] s = values.next().toString().split("\\s+");
      if (values.hasNext())
        throw new RuntimeException("There shouldn't be more than one value, key=" + key);

      int fileNo = Integer.parseInt(s[0]);
      long filePos = Long.parseLong(s[1]);
      // Pack (fileNo, filePos) into a single long; this assumes each postings
      // file is smaller than BigNumber bytes.
      long pos = BigNumber * fileNo + filePos;
      mCurDoc++;

      mOut.writeUTF(key.toString());
      mOut.writeLong(pos);
    }

    public void close() throws IOException {
      mOut.close();
    }
  }

  private static int printUsage() {
    System.out.println("usage: [input] [output-dir]");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  /**
   * Runs this tool.
   */
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      printUsage();
      return -1;
    }

    JobConf conf = new JobConf(BuildPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = 10;

    sLogger.info("Tool: PostingsForwardIndex");

    String postingsPath = args[0];
    String forwardIndexPath = args[1];

    if (!fs.exists(new Path(postingsPath))) {
      sLogger.info("Error: postings don't exist!");
      return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
      sLogger.info("PostingsForwardIndex already exists: skipping!");
      return 0;
    }

    conf.set("ForwardIndexPath", forwardIndexPath);
    conf.setJobName("BuildPostingsForwardIndex");

    Path inputPath = new Path(postingsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    // A single reducer so the forward index is written as one file.
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    // The reducer writes directly to HDFS, so no job output is needed.
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
  }

  public static void RunBuildPostingForwardIndex(String input, String output) throws Exception {
    String[] args = new String[] { input, output };
    new BuildPostingsForwardIndex().run(args);
  }

  public static void main(String[] args) throws Exception {
    new BuildPostingsForwardIndex().run(args);
  }
}
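Because the reducer writes the index as alternating writeUTF/writeLong records, it can be read back with a plain sequential scan. The following is a minimal sketch of such a reader, not part of the original code; the class name DumpPostingsForwardIndex is hypothetical. It recovers the file number and byte offset from each packed position using the same BigNumber constant.

package sa.edu.kaust.twitter.index;

import java.io.EOFException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical reader for the forward index file; not part of the original toolkit.
public class DumpPostingsForwardIndex {
  public static void main(String[] args) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    FSDataInputStream in = fs.open(new Path(args[0]));
    try {
      while (true) {
        String term = in.readUTF(); // term, as written by MyReducer
        long pos = in.readLong();   // packed (fileNo, offset) pair
        int fileNo = (int) (pos / BuildPostingsForwardIndex.BigNumber);
        long offset = pos % BuildPostingsForwardIndex.BigNumber;
        System.out.println(term + "\t" + fileNo + "\t" + offset);
      }
    } catch (EOFException e) {
      // End of index reached.
    } finally {
      in.close();
    }
  }
}

Packing the pair as BigNumber * fileNo + filePos keeps each entry to a fixed eight bytes after the term, but it only round-trips correctly while every postings file stays under 10^9 bytes. Assuming the classes are packaged into a jar, the index builder itself can be run with the standard Hadoop launcher, e.g. hadoop jar <jar> sa.edu.kaust.twitter.index.BuildPostingsForwardIndex <postings-path> <forward-index-path>.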