Java tutorial: computing the intersection of two datasets with an Apache Tez DAG (IntersectExample)
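The listing below is the complete IntersectExample from the Apache Tez examples (package org.apache.tez.mapreduce.examples). It builds a three-vertex DAG: two vertices read the input files and re-emit each line as a key, and a third vertex receives both streams over hash-partitioned edges and writes out the keys that appear in both inputs.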
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.examples;

import java.io.IOException;
import java.net.URI;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.tez.client.TezClient;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.OutputDescriptor;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezException;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;
import org.apache.tez.mapreduce.committer.MROutputCommitter;
import org.apache.tez.mapreduce.common.MRInputAMSplitGenerator;
import org.apache.tez.mapreduce.hadoop.MRHelpers;
import org.apache.tez.mapreduce.input.MRInput;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.LogicalOutput;
import org.apache.tez.runtime.api.Reader;
import org.apache.tez.runtime.library.api.KeyValueReader;
import org.apache.tez.runtime.library.api.KeyValueWriter;
import org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfigurer;
import org.apache.tez.runtime.library.partitioner.HashPartitioner;
import org.apache.tez.runtime.library.processor.SimpleProcessor;

import com.google.common.base.Preconditions;

public class IntersectExample extends Configured implements Tool {

  private static final Log LOG = LogFactory.getLog(IntersectExample.class);

  public static void main(String[] args) throws Exception {
    IntersectExample intersect = new IntersectExample();
    int status = ToolRunner.run(new Configuration(), intersect, args);
    System.exit(status);
  }

  private static void printUsage() {
    System.err.println("Usage: " + "intersect <file1> <file2> <numPartitions> <outPath>");
    ToolRunner.printGenericCommandUsage(System.err);
  }
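  // Two entry points follow: run(String[]) creates and tears down its own
  // TezClient session, while run(Configuration, String[], TezClient) reuses a
  // session supplied by the caller, which is useful when several DAGs are
  // submitted through one session.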
  @Override
  public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    int result = validateArgs(otherArgs);
    if (result != 0) {
      return result;
    }
    return execute(otherArgs);
  }

  public int run(Configuration conf, String[] args, TezClient tezSession) throws Exception {
    setConf(conf);
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    int result = validateArgs(otherArgs);
    if (result != 0) {
      return result;
    }
    return execute(otherArgs, tezSession);
  }

  private int validateArgs(String[] otherArgs) {
    if (otherArgs.length != 4) {
      printUsage();
      return 2;
    }
    return 0;
  }

  private int execute(String[] args) throws TezException, IOException, InterruptedException {
    TezConfiguration tezConf = new TezConfiguration(getConf());
    TezClient tezSession = null;
    try {
      tezSession = createTezSession(tezConf);
      return execute(args, tezConf, tezSession);
    } finally {
      if (tezSession != null) {
        tezSession.stop();
      }
    }
  }

  private int execute(String[] args, TezClient tezSession) throws IOException, TezException,
      InterruptedException {
    TezConfiguration tezConf = new TezConfiguration(getConf());
    return execute(args, tezConf, tezSession);
  }

  private TezClient createTezSession(TezConfiguration tezConf) throws TezException, IOException {
    TezClient tezSession = new TezClient("IntersectExampleSession", tezConf);
    tezSession.start();
    return tezSession;
  }

  private int execute(String[] args, TezConfiguration tezConf, TezClient tezSession)
      throws IOException, TezException, InterruptedException {
    LOG.info("Running IntersectExample");
    UserGroupInformation.setConfiguration(tezConf);

    String streamInputDir = args[0];
    String hashInputDir = args[1];
    int numPartitions = Integer.parseInt(args[2]);
    String outputDir = args[3];

    Path streamInputPath = new Path(streamInputDir);
    Path hashInputPath = new Path(hashInputDir);
    Path outputPath = new Path(outputDir);

    // Verify output path existence
    FileSystem fs = FileSystem.get(tezConf);
    if (fs.exists(outputPath)) {
      System.err.println("Output directory: " + outputDir + " already exists");
      return 3;
    }
    if (numPartitions <= 0) {
      System.err.println("NumPartitions must be > 0");
      return 4;
    }

    DAG dag = createDag(tezConf, streamInputPath, hashInputPath, outputPath, numPartitions);
    setupURIsForCredentials(dag, streamInputPath, hashInputPath, outputPath);

    tezSession.waitTillReady();
    DAGClient dagClient = tezSession.submitDAG(dag);
    DAGStatus dagStatus = dagClient.waitForCompletionWithAllStatusUpdates(null);
    if (dagStatus.getState() != DAGStatus.State.SUCCEEDED) {
      LOG.info("DAG diagnostics: " + dagStatus.getDiagnostics());
      return -1;
    }
    return 0;
  }
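  // The DAG built below has three vertices: "partitioner1" and "partitioner2"
  // each read one input file and forward its lines as keys, and both feed the
  // "intersect" vertex over unordered, hash-partitioned edges, so identical
  // keys from both sides are routed to the same intersect task.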
  private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath,
      int numPartitions) throws IOException {
    DAG dag = new DAG("IntersectExample");

    // Configuration for src1
    Configuration streamInputConf = new Configuration(tezConf);
    streamInputConf.set(FileInputFormat.INPUT_DIR, streamPath.toUri().toString());
    byte[] streamInputPayload = MRInput.createUserPayload(streamInputConf,
        TextInputFormat.class.getName(), true, false);

    // Configuration for src2
    Configuration hashInputConf = new Configuration(tezConf);
    hashInputConf.set(FileInputFormat.INPUT_DIR, hashPath.toUri().toString());
    byte[] hashInputPayload = MRInput.createUserPayload(hashInputConf,
        TextInputFormat.class.getName(), true, false);

    // Configuration for intermediate output - shared by Vertex1 and Vertex2.
    // This should only be setting selective keys from the underlying conf.
    // Fix after there's a better mechanism to configure the IOs.
    UnorderedPartitionedKVEdgeConfigurer edgeConf = UnorderedPartitionedKVEdgeConfigurer
        .newBuilder(Text.class.getName(), NullWritable.class.getName(),
            HashPartitioner.class.getName(), null).build();

    Configuration finalOutputConf = new Configuration(tezConf);
    finalOutputConf.set(FileOutputFormat.OUTDIR, outPath.toUri().toString());
    byte[] finalOutputPayload = MROutput.createUserPayload(finalOutputConf,
        TextOutputFormat.class.getName(), true);

    // Change the way resources are setup - no MRHelpers
    Vertex streamFileVertex = new Vertex("partitioner1",
        new ProcessorDescriptor(ForwardingProcessor.class.getName()), -1,
        MRHelpers.getMapResource(tezConf)).addInput("streamfile",
        new InputDescriptor(MRInput.class.getName()).setUserPayload(streamInputPayload),
        MRInputAMSplitGenerator.class);

    Vertex hashFileVertex = new Vertex("partitioner2",
        new ProcessorDescriptor(ForwardingProcessor.class.getName()), -1,
        MRHelpers.getMapResource(tezConf)).addInput("hashfile",
        new InputDescriptor(MRInput.class.getName()).setUserPayload(hashInputPayload),
        MRInputAMSplitGenerator.class);

    Vertex intersectVertex = new Vertex("intersect",
        new ProcessorDescriptor(IntersectProcessor.class.getName()), numPartitions,
        MRHelpers.getReduceResource(tezConf)).addOutput("finalOutput",
        new OutputDescriptor(MROutput.class.getName()).setUserPayload(finalOutputPayload),
        MROutputCommitter.class);

    Edge e1 = new Edge(streamFileVertex, intersectVertex, edgeConf.createDefaultEdgeProperty());
    Edge e2 = new Edge(hashFileVertex, intersectVertex, edgeConf.createDefaultEdgeProperty());

    dag.addVertex(streamFileVertex).addVertex(hashFileVertex).addVertex(intersectVertex)
        .addEdge(e1).addEdge(e2);
    return dag;
  }

  private void setupURIsForCredentials(DAG dag, Path... paths) throws IOException {
    List<URI> uris = new LinkedList<URI>();
    for (Path path : paths) {
      FileSystem fs = path.getFileSystem(getConf());
      Path qPath = fs.makeQualified(path);
      uris.add(qPath.toUri());
    }
    dag.addURIsForCredentials(uris);
  }

  // private void obtainTokens(Credentials credentials, Path... paths) throws IOException {
  //   TokenCache.obtainTokensForNamenodes(credentials, paths, getConf());
  // }
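  // The two processors below amount to a hash join on keys alone:
  // ForwardingProcessor runs in both partitioner vertices, and
  // IntersectProcessor loads the "partitioner2" side into an in-memory
  // HashSet, then probes it while streaming through the "partitioner1" side.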
  /**
   * Reads key-values from the source and forwards the value as the key for the output.
   */
  public static class ForwardingProcessor extends SimpleProcessor {
    @Override
    public void run() throws Exception {
      Preconditions.checkState(getInputs().size() == 1);
      Preconditions.checkState(getOutputs().size() == 1);
      LogicalInput input = getInputs().values().iterator().next();
      Reader rawReader = input.getReader();
      Preconditions.checkState(rawReader instanceof KeyValueReader);
      LogicalOutput output = getOutputs().values().iterator().next();

      KeyValueReader reader = (KeyValueReader) rawReader;
      KeyValueWriter writer = (KeyValueWriter) output.getWriter();

      while (reader.next()) {
        Object val = reader.getCurrentValue();
        writer.write(val, NullWritable.get());
      }
    }
  }

  public static class IntersectProcessor extends SimpleProcessor {

    @Override
    public void run() throws Exception {
      Preconditions.checkState(getInputs().size() == 2);
      Preconditions.checkState(getOutputs().size() == 1);
      LogicalInput streamInput = getInputs().get("partitioner1");
      LogicalInput hashInput = getInputs().get("partitioner2");
      Reader rawStreamReader = streamInput.getReader();
      Reader rawHashReader = hashInput.getReader();
      Preconditions.checkState(rawStreamReader instanceof KeyValueReader);
      Preconditions.checkState(rawHashReader instanceof KeyValueReader);
      LogicalOutput lo = getOutputs().values().iterator().next();
      Preconditions.checkState(lo instanceof MROutput);
      MROutput output = (MROutput) lo;
      KeyValueWriter writer = output.getWriter();

      KeyValueReader hashKvReader = (KeyValueReader) rawHashReader;
      Set<Text> keySet = new HashSet<Text>();
      while (hashKvReader.next()) {
        keySet.add(new Text((Text) hashKvReader.getCurrentKey()));
      }

      KeyValueReader streamKvReader = (KeyValueReader) rawStreamReader;
      while (streamKvReader.next()) {
        Text key = (Text) streamKvReader.getCurrentKey();
        if (keySet.contains(key)) {
          writer.write(key, NullWritable.get());
        }
      }

      LOG.info("Completed Processing. Trying to commit");
      while (!getContext().canCommit()) {
        Thread.sleep(100L);
      }
      output.commit();
    }
  }
}
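To try the example, place a line-per-record text file in each of two input directories and submit the job with four arguments: the two input paths, the number of partitions for the intersect vertex, and a non-existent output path. A hypothetical invocation (the jar name and paths are placeholders, not part of the example itself):

  yarn jar tez-mapreduce-examples.jar org.apache.tez.mapreduce.examples.IntersectExample /data/stream /data/hash 2 /out/intersect

On success the output directory holds one file per intersect partition, each line being a key from <file1> that also occurs in <file2>. Note that the stream side is not deduplicated, so a key appearing several times in <file1> is written several times.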