List of usage examples for org.apache.hadoop.mapreduce.lib.output FileOutputFormat OUTDIR
String OUTDIR
To view the source code for org.apache.hadoop.mapreduce.lib.output FileOutputFormat OUTDIR, click the Source Link.
From source file:org.apache.tajo.storage.hbase.HFileAppender.java
License:Apache License
@Override public void init() throws IOException { super.init(); Configuration taskConf = new Configuration(); Path stagingResultDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME); taskConf.set(FileOutputFormat.OUTDIR, stagingResultDir.toString()); ExecutionBlockId ebId = taskAttemptId.getTaskId().getExecutionBlockId(); writerContext = new TaskAttemptContextImpl(taskConf, new TaskAttemptID(ebId.getQueryId().toString(), ebId.getId(), TaskType.MAP, taskAttemptId.getTaskId().getId(), taskAttemptId.getId())); HFileOutputFormat2 hFileOutputFormat2 = new HFileOutputFormat2(); try {//from w w w . j a va 2 s .c o m writer = hFileOutputFormat2.getRecordWriter(writerContext); committer = new FileOutputCommitter(FileOutputFormat.getOutputPath(writerContext), writerContext); workingFilePath = committer.getWorkPath(); } catch (InterruptedException e) { throw new IOException(e.getMessage(), e); } LOG.info("Created hbase file writer: " + workingFilePath); }
From source file:org.apache.tez.mapreduce.examples.FilterLinesByWord.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); Credentials credentials = new Credentials(); boolean generateSplitsInClient = false; SplitsInClientOptionParser splitCmdLineParser = new SplitsInClientOptionParser(); try {//from w w w .ja va 2 s . c o m generateSplitsInClient = splitCmdLineParser.parse(otherArgs, false); otherArgs = splitCmdLineParser.getRemainingArgs(); } catch (ParseException e1) { System.err.println("Invalid options"); printUsage(); return 2; } if (otherArgs.length != 3) { printUsage(); return 2; } String inputPath = otherArgs[0]; String outputPath = otherArgs[1]; String filterWord = otherArgs[2]; FileSystem fs = FileSystem.get(conf); if (fs.exists(new Path(outputPath))) { System.err.println("Output directory : " + outputPath + " already exists"); return 2; } TezConfiguration tezConf = new TezConfiguration(conf); fs.getWorkingDirectory(); Path stagingDir = new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString()); tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDir.toString()); TezClientUtils.ensureStagingDirExists(tezConf, stagingDir); String jarPath = ClassUtil.findContainingJar(FilterLinesByWord.class); if (jarPath == null) { throw new TezUncheckedException( "Could not find any jar containing" + FilterLinesByWord.class.getName() + " in the classpath"); } Path remoteJarPath = fs.makeQualified(new Path(stagingDir, "dag_job.jar")); fs.copyFromLocalFile(new Path(jarPath), remoteJarPath); FileStatus remoteJarStatus = fs.getFileStatus(remoteJarPath); TokenCache.obtainTokensForNamenodes(credentials, new Path[] { remoteJarPath }, conf); Map<String, LocalResource> commonLocalResources = new TreeMap<String, LocalResource>(); LocalResource dagJarLocalRsrc = LocalResource.newInstance(ConverterUtils.getYarnUrlFromPath(remoteJarPath), LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, remoteJarStatus.getLen(), 
remoteJarStatus.getModificationTime()); commonLocalResources.put("dag_job.jar", dagJarLocalRsrc); TezClient tezSession = TezClient.create("FilterLinesByWordSession", tezConf, commonLocalResources, credentials); tezSession.start(); // Why do I need to start the TezSession. Configuration stage1Conf = new JobConf(conf); stage1Conf.set(FILTER_PARAM_NAME, filterWord); Configuration stage2Conf = new JobConf(conf); stage2Conf.set(FileOutputFormat.OUTDIR, outputPath); stage2Conf.setBoolean("mapred.mapper.new-api", false); UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf); // Setup stage1 Vertex Vertex stage1Vertex = Vertex.create("stage1", ProcessorDescriptor .create(FilterByWordInputProcessor.class.getName()).setUserPayload(stage1Payload)) .addTaskLocalFiles(commonLocalResources); DataSourceDescriptor dsd; if (generateSplitsInClient) { // TODO TEZ-1406. Dont' use MRInputLegacy stage1Conf.set(FileInputFormat.INPUT_DIR, inputPath); stage1Conf.setBoolean("mapred.mapper.new-api", false); dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, stagingDir, true); } else { dsd = MRInputLegacy.createConfigBuilder(stage1Conf, TextInputFormat.class, inputPath).groupSplits(false) .build(); } stage1Vertex.addDataSource("MRInput", dsd); // Setup stage2 Vertex Vertex stage2Vertex = Vertex.create("stage2", ProcessorDescriptor.create(FilterByWordOutputProcessor.class.getName()) .setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)), 1); stage2Vertex.addTaskLocalFiles(commonLocalResources); // Configure the Output for stage2 OutputDescriptor od = OutputDescriptor.create(MROutput.class.getName()) .setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)); OutputCommitterDescriptor ocd = OutputCommitterDescriptor.create(MROutputCommitter.class.getName()); stage2Vertex.addDataSink("MROutput", DataSinkDescriptor.create(od, ocd, null)); UnorderedKVEdgeConfig edgeConf = UnorderedKVEdgeConfig .newBuilder(Text.class.getName(), 
TextLongPair.class.getName()).setFromConfiguration(tezConf) .build(); DAG dag = DAG.create("FilterLinesByWord"); Edge edge = Edge.create(stage1Vertex, stage2Vertex, edgeConf.createDefaultBroadcastEdgeProperty()); dag.addVertex(stage1Vertex).addVertex(stage2Vertex).addEdge(edge); LOG.info("Submitting DAG to Tez Session"); DAGClient dagClient = tezSession.submitDAG(dag); LOG.info("Submitted DAG to Tez Session"); DAGStatus dagStatus = null; String[] vNames = { "stage1", "stage2" }; try { while (true) { dagStatus = dagClient.getDAGStatus(null); if (dagStatus.getState() == DAGStatus.State.RUNNING || dagStatus.getState() == DAGStatus.State.SUCCEEDED || dagStatus.getState() == DAGStatus.State.FAILED || dagStatus.getState() == DAGStatus.State.KILLED || dagStatus.getState() == DAGStatus.State.ERROR) { break; } try { Thread.sleep(500); } catch (InterruptedException e) { // continue; } } while (dagStatus.getState() == DAGStatus.State.RUNNING) { try { ExampleDriver.printDAGStatus(dagClient, vNames); try { Thread.sleep(1000); } catch (InterruptedException e) { // continue; } dagStatus = dagClient.getDAGStatus(null); } catch (TezException e) { LOG.fatal("Failed to get application progress. Exiting"); return -1; } } dagStatus = dagClient.getDAGStatus(Sets.newHashSet(StatusGetOpts.GET_COUNTERS)); } finally { fs.delete(stagingDir, true); tezSession.stop(); } ExampleDriver.printDAGStatus(dagClient, vNames, true, true); LOG.info("Application completed. " + "FinalState=" + dagStatus.getState()); return dagStatus.getState() == DAGStatus.State.SUCCEEDED ? 0 : 1; }
From source file:org.apache.tez.mapreduce.examples.FilterLinesByWordOneToOne.java
License:Apache License
@Override public int run(String[] otherArgs) throws Exception { boolean generateSplitsInClient = false; SplitsInClientOptionParser splitCmdLineParser = new SplitsInClientOptionParser(); try {// ww w.j a v a2 s. co m generateSplitsInClient = splitCmdLineParser.parse(otherArgs, false); otherArgs = splitCmdLineParser.getRemainingArgs(); } catch (ParseException e1) { System.err.println("Invalid options"); printUsage(); return 2; } if (otherArgs.length != 3) { printUsage(); return 2; } String inputPath = otherArgs[0]; String outputPath = otherArgs[1]; String filterWord = otherArgs[2]; Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); if (fs.exists(new Path(outputPath))) { System.err.println("Output directory : " + outputPath + " already exists"); return 2; } TezConfiguration tezConf = new TezConfiguration(conf); fs.getWorkingDirectory(); Path stagingDir = new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString()); tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDir.toString()); TezClientUtils.ensureStagingDirExists(tezConf, stagingDir); String jarPath = ClassUtil.findContainingJar(FilterLinesByWordOneToOne.class); if (jarPath == null) { throw new TezUncheckedException("Could not find any jar containing" + FilterLinesByWordOneToOne.class.getName() + " in the classpath"); } Path remoteJarPath = fs.makeQualified(new Path(stagingDir, "dag_job.jar")); fs.copyFromLocalFile(new Path(jarPath), remoteJarPath); FileStatus remoteJarStatus = fs.getFileStatus(remoteJarPath); Map<String, LocalResource> commonLocalResources = new TreeMap<String, LocalResource>(); LocalResource dagJarLocalRsrc = LocalResource.newInstance(ConverterUtils.getYarnUrlFromPath(remoteJarPath), LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, remoteJarStatus.getLen(), remoteJarStatus.getModificationTime()); commonLocalResources.put("dag_job.jar", dagJarLocalRsrc); TezClient tezSession = TezClient.create("FilterLinesByWordSession", tezConf, commonLocalResources, 
null); tezSession.start(); // Why do I need to start the TezSession. Configuration stage1Conf = new JobConf(conf); stage1Conf.set(FILTER_PARAM_NAME, filterWord); Configuration stage2Conf = new JobConf(conf); stage2Conf.set(FileOutputFormat.OUTDIR, outputPath); stage2Conf.setBoolean("mapred.mapper.new-api", false); UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf); // Setup stage1 Vertex Vertex stage1Vertex = Vertex.create("stage1", ProcessorDescriptor .create(FilterByWordInputProcessor.class.getName()).setUserPayload(stage1Payload)) .addTaskLocalFiles(commonLocalResources); DataSourceDescriptor dsd; if (generateSplitsInClient) { // TODO TEZ-1406. Dont' use MRInputLegacy stage1Conf.set(FileInputFormat.INPUT_DIR, inputPath); stage1Conf.setBoolean("mapred.mapper.new-api", false); dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, stagingDir, true); } else { dsd = MRInputLegacy.createConfigBuilder(stage1Conf, TextInputFormat.class, inputPath).groupSplits(false) .build(); } stage1Vertex.addDataSource("MRInput", dsd); // Setup stage2 Vertex Vertex stage2Vertex = Vertex.create("stage2", ProcessorDescriptor.create(FilterByWordOutputProcessor.class.getName()) .setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)), dsd.getNumberOfShards()); stage2Vertex.addTaskLocalFiles(commonLocalResources); // Configure the Output for stage2 stage2Vertex.addDataSink("MROutput", DataSinkDescriptor.create( OutputDescriptor.create(MROutput.class.getName()) .setUserPayload(TezUtils.createUserPayloadFromConf(stage2Conf)), OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), null)); UnorderedKVEdgeConfig edgeConf = UnorderedKVEdgeConfig .newBuilder(Text.class.getName(), TextLongPair.class.getName()).setFromConfiguration(tezConf) .build(); DAG dag = DAG.create("FilterLinesByWord"); Edge edge = Edge.create(stage1Vertex, stage2Vertex, edgeConf.createDefaultOneToOneEdgeProperty()); 
dag.addVertex(stage1Vertex).addVertex(stage2Vertex).addEdge(edge); LOG.info("Submitting DAG to Tez Session"); DAGClient dagClient = tezSession.submitDAG(dag); LOG.info("Submitted DAG to Tez Session"); DAGStatus dagStatus = null; String[] vNames = { "stage1", "stage2" }; try { while (true) { dagStatus = dagClient.getDAGStatus(null); if (dagStatus.getState() == DAGStatus.State.RUNNING || dagStatus.getState() == DAGStatus.State.SUCCEEDED || dagStatus.getState() == DAGStatus.State.FAILED || dagStatus.getState() == DAGStatus.State.KILLED || dagStatus.getState() == DAGStatus.State.ERROR) { break; } try { Thread.sleep(500); } catch (InterruptedException e) { // continue; } } while (dagStatus.getState() == DAGStatus.State.RUNNING) { try { ExampleDriver.printDAGStatus(dagClient, vNames); try { Thread.sleep(1000); } catch (InterruptedException e) { // continue; } dagStatus = dagClient.getDAGStatus(null); } catch (TezException e) { LOG.fatal("Failed to get application progress. Exiting"); return -1; } } } finally { fs.delete(stagingDir, true); tezSession.stop(); } ExampleDriver.printDAGStatus(dagClient, vNames); LOG.info("Application completed. " + "FinalState=" + dagStatus.getState()); return dagStatus.getState() == DAGStatus.State.SUCCEEDED ? 0 : 1; }
From source file:org.apache.tez.mapreduce.examples.IntersectDataGen.java
License:Apache License
/**
 * Creates the user payload for an {@code MROutput} writing to the given path.
 *
 * <p>Works on a copy of {@code srcConf}, so the caller's configuration is not
 * modified; the copy gets its output directory set to the URI string of
 * {@code outputPath} and is handed to {@code MROutput.createUserPayload}
 * together with the {@code TextOutputFormat} class name.
 *
 * @param outputPath directory the output should be written to
 * @param srcConf    configuration to base the payload on
 * @return the payload bytes produced by {@code MROutput.createUserPayload}
 * @throws IOException if the payload cannot be created
 */
private byte[] createPayloadForOutput(Path outputPath, Configuration srcConf) throws IOException {
    Configuration outputConf = new Configuration(srcConf);
    String outputDir = outputPath.toUri().toString();
    outputConf.set(FileOutputFormat.OUTDIR, outputDir);
    return MROutput.createUserPayload(outputConf, TextOutputFormat.class.getName(), true);
}
From source file:org.apache.tez.mapreduce.examples.IntersectExample.java
License:Apache License
private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath, int numPartitions) throws IOException { DAG dag = new DAG("IntersectExample"); // Configuration for src1 Configuration streamInputConf = new Configuration(tezConf); streamInputConf.set(FileInputFormat.INPUT_DIR, streamPath.toUri().toString()); byte[] streamInputPayload = MRInput.createUserPayload(streamInputConf, TextInputFormat.class.getName(), true, false);//from w ww .j a v a 2s .c om // Configuration for src2 Configuration hashInputConf = new Configuration(tezConf); hashInputConf.set(FileInputFormat.INPUT_DIR, hashPath.toUri().toString()); byte[] hashInputPayload = MRInput.createUserPayload(hashInputConf, TextInputFormat.class.getName(), true, false); // Configuration for intermediate output - shared by Vertex1 and Vertex2 // This should only be setting selective keys from the underlying conf. Fix after there's a // better mechanism to configure the IOs. UnorderedPartitionedKVEdgeConfigurer edgeConf = UnorderedPartitionedKVEdgeConfigurer.newBuilder( Text.class.getName(), NullWritable.class.getName(), HashPartitioner.class.getName(), null).build(); Configuration finalOutputConf = new Configuration(tezConf); finalOutputConf.set(FileOutputFormat.OUTDIR, outPath.toUri().toString()); byte[] finalOutputPayload = MROutput.createUserPayload(finalOutputConf, TextOutputFormat.class.getName(), true); // Change the way resources are setup - no MRHelpers Vertex streamFileVertex = new Vertex("partitioner1", new ProcessorDescriptor(ForwardingProcessor.class.getName()), -1, MRHelpers.getMapResource(tezConf)) .addInput("streamfile", new InputDescriptor(MRInput.class.getName()).setUserPayload(streamInputPayload), MRInputAMSplitGenerator.class); Vertex hashFileVertex = new Vertex("partitioner2", new ProcessorDescriptor(ForwardingProcessor.class.getName()), -1, MRHelpers.getMapResource(tezConf)) .addInput("hashfile", new 
InputDescriptor(MRInput.class.getName()).setUserPayload(hashInputPayload), MRInputAMSplitGenerator.class); Vertex intersectVertex = new Vertex("intersect", new ProcessorDescriptor(IntersectProcessor.class.getName()), numPartitions, MRHelpers.getReduceResource(tezConf)).addOutput("finalOutput", new OutputDescriptor(MROutput.class.getName()).setUserPayload(finalOutputPayload), MROutputCommitter.class); Edge e1 = new Edge(streamFileVertex, intersectVertex, edgeConf.createDefaultEdgeProperty()); Edge e2 = new Edge(hashFileVertex, intersectVertex, edgeConf.createDefaultEdgeProperty()); dag.addVertex(streamFileVertex).addVertex(hashFileVertex).addVertex(intersectVertex).addEdge(e1) .addEdge(e2); return dag; }
From source file:org.apache.tez.mapreduce.examples.UnionExample.java
License:Apache License
/**
 * Assembles the "UnionExample" DAG.
 *
 * <p>Three map vertices ("map1", "map2", "map3") share one {@code MRInput}
 * data source reading {@code inputPath}. "map1" and "map2" form the vertex
 * group "union", which writes to {@code outputPath + "-parts"} and feeds the
 * single-task "checker" vertex through a group input edge; "map3" feeds the
 * checker through a regular edge. The checker writes to {@code outputPath}
 * and to {@code outputPath + "-all-parts"}.
 *
 * <p>{@code fs}, {@code localResources} and {@code stagingDir} are not used by
 * this method.
 *
 * @param fs             unused
 * @param tezConf        base configuration copied for every input and output
 * @param localResources unused
 * @param stagingDir     unused
 * @param inputPath      input directory read by the map vertices
 * @param outputPath     output directory, also used as prefix for the other sinks
 * @return the assembled DAG
 * @throws IOException declared by the signature
 */
private DAG createDAG(FileSystem fs, TezConfiguration tezConf, Map<String, LocalResource> localResources,
        Path stagingDir, String inputPath, String outputPath) throws IOException {
    DAG dag = DAG.create("UnionExample");

    // Shared input: old-API text input with split generation in the AM disabled.
    Configuration inputConf = new Configuration(tezConf);
    inputConf.setBoolean("mapred.mapper.new-api", false);
    inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
    inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(inputConf, null).generateSplitsInAM(false)
            .build();

    final int numMaps = -1;
    final String tokenProcessor = TokenProcessor.class.getName();
    Vertex mapVertex1 = Vertex.create("map1", ProcessorDescriptor.create(tokenProcessor), numMaps)
            .addDataSource("MRInput", dataSource);
    Vertex mapVertex2 = Vertex.create("map2", ProcessorDescriptor.create(tokenProcessor), numMaps)
            .addDataSource("MRInput", dataSource);
    Vertex mapVertex3 = Vertex.create("map3", ProcessorDescriptor.create(tokenProcessor), numMaps)
            .addDataSource("MRInput", dataSource);

    Vertex checkerVertex = Vertex.create("checker",
            ProcessorDescriptor.create(UnionProcessor.class.getName()), 1);

    // Sink "union" on the checker: old-API text output into outputPath.
    Configuration outputConf = new Configuration(tezConf);
    outputConf.setBoolean("mapred.reducer.new-api", false);
    outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
    outputConf.set(FileOutputFormat.OUTDIR, outputPath);
    DataSinkDescriptor unionSink = MROutput.createConfigBuilder(outputConf, null).build();
    checkerVertex.addDataSink("union", unionSink);

    // Sink "all-parts" on the checker.
    Configuration allPartsConf = new Configuration(tezConf);
    DataSinkDescriptor allPartsSink = MROutput
            .createConfigBuilder(allPartsConf, TextOutputFormat.class, outputPath + "-all-parts").build();
    checkerVertex.addDataSink("all-parts", allPartsSink);

    // Sink "parts" on the vertex group made of map1 and map2.
    Configuration partsConf = new Configuration(tezConf);
    DataSinkDescriptor partsSink = MROutput
            .createConfigBuilder(partsConf, TextOutputFormat.class, outputPath + "-parts").build();
    VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
    unionVertex.addDataSink("parts", partsSink);

    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
            .newBuilder(Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName())
            .build();

    dag.addVertex(mapVertex1).addVertex(mapVertex2).addVertex(mapVertex3).addVertex(checkerVertex)
            .addEdge(Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
            .addEdge(GroupInputEdge.create(unionVertex, checkerVertex, edgeConf.createDefaultEdgeProperty(),
                    InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
    return dag;
}
From source file:org.pentaho.hadoop.shim.common.CommonFormatShimTestIT.java
License:Apache License
@Test public void testParquetWriteSuccessLocalFileSystem() throws Exception { final String PARQUET_FILE_NAME = "test.parquet"; String tempFile = Files.createTempDirectory("parquet").toUri().toString(); ConfigurationProxy jobConfiguration = new ConfigurationProxy(); jobConfiguration.set(FileOutputFormat.OUTDIR, tempFile); String parquetFilePath = jobConfiguration.get(FileOutputFormat.OUTDIR) + PARQUET_FILE_NAME; PentahoParquetOutputFormat pentahoParquetOutputFormat = new PentahoParquetOutputFormat(); pentahoParquetOutputFormat.setOutputFile(parquetFilePath, true); pentahoParquetOutputFormat.setFields(ParquetUtils.createOutputFields(ParquetSpec.DataType.INT_64)); IPentahoRecordWriter recordWriter = pentahoParquetOutputFormat.createRecordWriter(); RowMetaAndData rowInput = new RowMetaAndData(); RowMeta rowMeta = new RowMeta(); rowMeta.addValueMeta(new ValueMetaString("Name")); rowMeta.addValueMeta(new ValueMetaString("Age")); rowInput.setRowMeta(rowMeta);// w ww . j a va 2s . c o m rowInput.setData(new Object[] { "Andrey", "11" }); recordWriter.write(rowInput); recordWriter.close(); IPentahoRecordReader recordReader = readCreatedParquetFile(parquetFilePath); Object[] rowInputArr = new Object[] { rowInput.getData()[0].toString(), Long.parseLong(rowInput.getData()[1].toString()) }; recordReader.forEach( rowMetaAndData -> org.junit.Assert.assertArrayEquals(rowMetaAndData.getData(), rowInputArr)); }
From source file:org.shaf.core.io.emulator.RecordWriterFactoryTest.java
License:Apache License
/** * Run the record reader factory tests./* ww w .j a va2 s. c om*/ * * @throws Exception * if the test fails for some reason. */ @Test public void testFactory() throws Exception { Configuration config = job.getConfiguration(); config.set(FileOutputFormat.OUTDIR, super.dir.toString()); config.set(MRJobConfig.OUTPUT_KEY_CLASS, "org.apache.hadoop.io.NullWritable"); config.set(MRJobConfig.OUTPUT_VALUE_CLASS, "org.apache.hadoop.io.NullWritable"); assertEquals(TextWriter.class, RecordWriterFactory.createRecordWriter(SomeProcess1.class, job.getConfiguration()).getClass()); assertEquals(SequenceWriter.class, RecordWriterFactory.createRecordWriter(SomeProcess2.class, job.getConfiguration()).getClass()); }