Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tez.mapreduce.examples; import java.io.IOException; import java.net.URI; import java.util.LinkedList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.tez.client.TezClient; import org.apache.tez.common.counters.TezCounter; import org.apache.tez.dag.api.DAG; import org.apache.tez.dag.api.Edge; import org.apache.tez.dag.api.InputDescriptor; import org.apache.tez.dag.api.ProcessorDescriptor; import org.apache.tez.dag.api.TezConfiguration; import org.apache.tez.dag.api.TezException; import org.apache.tez.dag.api.Vertex; import org.apache.tez.dag.api.client.DAGClient; import org.apache.tez.dag.api.client.DAGStatus; import org.apache.tez.dag.api.client.StatusGetOpts; import org.apache.tez.mapreduce.common.MRInputAMSplitGenerator; import org.apache.tez.mapreduce.examples.IntersectExample.ForwardingProcessor; import org.apache.tez.mapreduce.hadoop.MRHelpers; import org.apache.tez.mapreduce.input.MRInput; import org.apache.tez.runtime.api.LogicalInput; import org.apache.tez.runtime.api.Reader; import org.apache.tez.runtime.library.api.KeyValuesReader; import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfigurer; import org.apache.tez.runtime.library.partitioner.HashPartitioner; import org.apache.tez.runtime.library.processor.SimpleProcessor; import com.google.common.base.Preconditions; import com.google.common.collect.Sets; public class IntersectValidate extends Configured implements Tool { private static final Log LOG = LogFactory.getLog(IntersectExample.class); private static final String LHS_INPUT_NAME = "lhsfile"; private static final String RHS_INPUT_NAME = "rhsfile"; private static final String COUNTER_GROUP_NAME = "INTERSECT_VALIDATE"; private static final String MISSING_KEY_COUNTER_NAME = "MISSING_KEY_EXISTS"; public static void main(String[] args) throws Exception { IntersectValidate validate = new IntersectValidate(); int status = ToolRunner.run(new Configuration(), validate, args); System.exit(status); } private static void printUsage() { System.err.println("Usage: " + "intersectvalidate <path1> <path2>"); ToolRunner.printGenericCommandUsage(System.err); } @Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); int result = validateArgs(otherArgs); if (result != 0) { return result; } return execute(otherArgs); } public int run(Configuration conf, String[] args, TezClient tezSession) throws Exception { setConf(conf); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); int result = validateArgs(otherArgs); if (result != 0) { return result; } return execute(otherArgs, tezSession); } private int validateArgs(String[] otherArgs) { if (otherArgs.length != 3 && otherArgs.length != 2) { printUsage(); return 2; } return 0; } private int execute(String[] args) throws TezException, IOException, InterruptedException { TezConfiguration tezConf = new TezConfiguration(getConf()); TezClient tezSession = null; try { tezSession = createTezSession(tezConf); return execute(args, tezConf, tezSession); } finally { if (tezSession != null) { tezSession.stop(); } } } private int execute(String[] args, TezClient tezSession) throws IOException, TezException, InterruptedException { TezConfiguration tezConf = new TezConfiguration(getConf()); return execute(args, tezConf, tezSession); } private TezClient createTezSession(TezConfiguration tezConf) throws TezException, IOException { TezClient tezSession = new TezClient("IntersectValidateSession", tezConf); tezSession.start(); return tezSession; } private int execute(String[] args, TezConfiguration tezConf, TezClient tezSession) throws IOException, TezException, InterruptedException { LOG.info("Running IntersectValidate"); UserGroupInformation.setConfiguration(tezConf); String lhsDir = args[0]; String rhsDir = args[1]; int numPartitions = 1; if (args.length == 3) { numPartitions = Integer.parseInt(args[2]); } if (numPartitions <= 0) { System.err.println("NumPartitions must be > 0"); return 4; } Path lhsPath = new Path(lhsDir); Path rhsPath = new Path(rhsDir); DAG dag = createDag(tezConf, lhsPath, rhsPath, numPartitions); setupURIsForCredentials(dag, lhsPath, rhsPath); tezSession.waitTillReady(); DAGClient dagClient = tezSession.submitDAG(dag); DAGStatus dagStatus = dagClient.waitForCompletionWithAllStatusUpdates(null); if (dagStatus.getState() != DAGStatus.State.SUCCEEDED) { LOG.info("DAG diagnostics: " + dagStatus.getDiagnostics()); return -1; } else { dagStatus = dagClient.getDAGStatus(Sets.newHashSet(StatusGetOpts.GET_COUNTERS)); TezCounter counter = dagStatus.getDAGCounters().findCounter(COUNTER_GROUP_NAME, MISSING_KEY_COUNTER_NAME); if (counter == null) { LOG.info("Unable to determing equality"); return -2; } else { if (counter.getValue() != 0) { LOG.info("Validate failed. The two sides are not equivalent"); return -3; } else { LOG.info("Vlidation successful. The two sides are equivalent"); return 0; } } } } private DAG createDag(TezConfiguration tezConf, Path lhs, Path rhs, int numPartitions) throws IOException { DAG dag = new DAG("IntersectValidate"); // Configuration for src1 Configuration lhsInputConf = new Configuration(tezConf); lhsInputConf.set(FileInputFormat.INPUT_DIR, lhs.toUri().toString()); byte[] streamInputPayload = MRInput.createUserPayload(lhsInputConf, TextInputFormat.class.getName(), true, false); // Configuration for src2 Configuration rhsInputConf = new Configuration(tezConf); rhsInputConf.set(FileInputFormat.INPUT_DIR, rhs.toUri().toString()); byte[] hashInputPayload = MRInput.createUserPayload(rhsInputConf, TextInputFormat.class.getName(), true, false); // Configuration for intermediate output - shared by Vertex1 and Vertex2 // This should only be setting selective keys from the underlying conf. Fix after there's a // better mechanism to configure the IOs. OrderedPartitionedKVEdgeConfigurer edgeConf = OrderedPartitionedKVEdgeConfigurer.newBuilder( Text.class.getName(), NullWritable.class.getName(), HashPartitioner.class.getName(), null).build(); // Change the way resources are setup - no MRHelpers Vertex lhsVertex = new Vertex(LHS_INPUT_NAME, new ProcessorDescriptor(ForwardingProcessor.class.getName()), -1, MRHelpers.getMapResource(tezConf)).addInput("lhs", new InputDescriptor(MRInput.class.getName()).setUserPayload(streamInputPayload), MRInputAMSplitGenerator.class); Vertex rhsVertex = new Vertex(RHS_INPUT_NAME, new ProcessorDescriptor(ForwardingProcessor.class.getName()), -1, MRHelpers.getMapResource(tezConf)).addInput("rhs", new InputDescriptor(MRInput.class.getName()).setUserPayload(hashInputPayload), MRInputAMSplitGenerator.class); Vertex intersectValidateVertex = new Vertex("intersectvalidate", new ProcessorDescriptor(IntersectValidateProcessor.class.getName()), numPartitions, MRHelpers.getReduceResource(tezConf)); Edge e1 = new Edge(lhsVertex, intersectValidateVertex, edgeConf.createDefaultEdgeProperty()); Edge e2 = new Edge(rhsVertex, intersectValidateVertex, edgeConf.createDefaultEdgeProperty()); dag.addVertex(lhsVertex).addVertex(rhsVertex).addVertex(intersectValidateVertex).addEdge(e1).addEdge(e2); return dag; } public static class IntersectValidateProcessor extends SimpleProcessor { private static final Log LOG = LogFactory.getLog(IntersectValidateProcessor.class); @Override public void run() throws Exception { Preconditions.checkState(getInputs().size() == 2); Preconditions.checkState(getOutputs().size() == 0); LogicalInput lhsInput = getInputs().get(LHS_INPUT_NAME); LogicalInput rhsInput = getInputs().get(RHS_INPUT_NAME); Reader lhsReaderRaw = lhsInput.getReader(); Reader rhsReaderRaw = rhsInput.getReader(); Preconditions.checkState(lhsReaderRaw instanceof KeyValuesReader); Preconditions.checkState(rhsReaderRaw instanceof KeyValuesReader); KeyValuesReader lhsReader = (KeyValuesReader) lhsReaderRaw; KeyValuesReader rhsReader = (KeyValuesReader) rhsReaderRaw; TezCounter lhsMissingKeyCounter = getContext().getCounters().findCounter(COUNTER_GROUP_NAME, MISSING_KEY_COUNTER_NAME); while (lhsReader.next()) { if (rhsReader.next()) { if (!lhsReader.getCurrentKey().equals(rhsReader.getCurrentKey())) { LOG.info("MismatchedKeys: " + "lhs=" + lhsReader.getCurrentKey() + ", rhs=" + rhsReader.getCurrentKey()); lhsMissingKeyCounter.increment(1); } } else { lhsMissingKeyCounter.increment(1); LOG.info("ExtraKey in lhs: " + lhsReader.getClass()); break; } } if (rhsReader.next()) { lhsMissingKeyCounter.increment(1); LOG.info("ExtraKey in rhs: " + lhsReader.getClass()); } } } private void setupURIsForCredentials(DAG dag, Path... paths) throws IOException { List<URI> uris = new LinkedList<URI>(); for (Path path : paths) { FileSystem fs = path.getFileSystem(getConf()); Path qPath = fs.makeQualified(path); uris.add(qPath.toUri()); } dag.addURIsForCredentials(uris); } }