List of usage examples for org.apache.hadoop.mapreduce.lib.output.MultipleOutputs#addNamedOutput
@SuppressWarnings("unchecked") public static void addNamedOutput(Job job, String namedOutput, Class<? extends OutputFormat> outputFormatClass, Class<?> keyClass, Class<?> valueClass)
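The signature above only registers a named output on the driver side; records reach it from a mapper or reducer through a MultipleOutputs instance. Below is a minimal sketch of both halves; the class, job wiring, and the output name "rare" are hypothetical, not taken from the examples that follow. Note that named-output names must consist of letters and digits only.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

// Driver side (hypothetical job): register the named output "rare".
//   MultipleOutputs.addNamedOutput(job, "rare", TextOutputFormat.class, Text.class, LongWritable.class);

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    private MultipleOutputs<Text, LongWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, LongWritable>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable v : values) {
            sum += v.get();
        }
        if (sum < 5) {
            // Goes to files named rare-r-NNNNN instead of the default part-r-NNNNN.
            mos.write("rare", key, new LongWritable(sum));
        } else {
            context.write(key, new LongWritable(sum));
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close(); // required; named-output writers are not flushed otherwise
    }
}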
From source file:com.talis.mapreduce.dicenc.SecondDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    Job job = new Job(getConf(), "second");
    job.setJarByClass(getClass());
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(SecondMapper.class);
    job.setReducerClass(SecondReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    MultipleOutputs.addNamedOutput(job, "dict", TextOutputFormat.class, Text.class, Text.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.tdunning.plume.local.lazy.MapRedExecutor.java
License:Apache License
/**
 * This method returns a Job instance out of a {@link MSCR} entity. It puts the class of
 * the {@link PlumeWorkflow} argument and the MSCR id in the hadoop configuration.
 *
 * @param mscr The MSCR to convert
 * @param workFlow The workflow whose class will be instantiated by hadoop mappers/reducers
 * @param workFlowOutputPath The temporary output path of the workflow
 * @param outputPath The output path of the MapRed job
 * @return A hadoop-executable MapRed Job
 * @throws IOException
 */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath)
        throws IOException {
    Configuration conf = new Configuration();
    conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
    conf.setInt(MSCR_ID, mscr.getId());
    conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);
    Job job = new Job(conf, "MSCR"); // TODO deprecation
    job.setMapOutputKeyClass(PlumeObject.class);
    job.setMapOutputValueClass(PlumeObject.class);
    job.setJarByClass(MapRedExecutor.class);
    // Define multiple inputs
    for (PCollection<?> input : mscr.getInputs()) {
        if (!(input instanceof LazyCollection)) {
            throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyTable");
        }
        LazyCollection<Text> l = (LazyCollection<Text>) input;
        if (!(l.isMaterialized() && l.getFile() != null)) {
            // Collections have plume ID only if they are intermediate results - TODO better naming for this
            if (l.getPlumeId().length() < 1) {
                throw new IllegalArgumentException(
                        "Can't create MapRed from MSCR inputs that are not materialized to a file");
            }
        }
        PCollectionType<?> rType = l.getType();
        Class<? extends InputFormat> format = SequenceFileInputFormat.class;
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            if (tType.valueType() instanceof StringType && tType.keyType() instanceof StringType) {
                format = KeyValueTextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        } else {
            if (rType.elementType() instanceof StringType) {
                format = TextInputFormat.class;
            }
            MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
        }
    }
    // Define multiple outputs
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
        PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output)
                .getType();
        if (rType instanceof PTableType) {
            PTableType<?, ?> tType = (PTableType<?, ?>) rType;
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat,
                    getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
        } else {
            Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
            if (rType.elementType() instanceof StringType) {
                outputFormat = TextOutputFormat.class;
            }
            MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat,
                    NullWritable.class, getHadoopType(rType.elementType()));
        }
    }
    // Define Reducer & Combiner
    job.setCombinerClass(MSCRCombiner.class);
    job.setReducerClass(MSCRReducer.class);
    job.setNumReduceTasks(1);
    return job;
}
From source file:com.wipro.ats.bdre.dq.DQDriver.java
License:Apache License
@Override
public int run(String[] arg) throws Exception {
    String processId = arg[0];
    String sPath = arg[1];
    String destDir = arg[2];
    Properties props = new GetProperties().getProperties(processId, "dq");
    LOGGER.debug("props=" + props);
    Configuration conf = getConf();
    conf.set("dq.process.id", processId);
    Job job = Job.getInstance(conf);
    job.setJobName("Data Quality " + processId);
    job.setJarByClass(DQDriver.class);
    job.setMapperClass(DQMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    // Reducer is not required
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    Path inputFilePath = new Path(sPath);
    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir));
    MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class,
            Text.class, NullWritable.class);
    if (!job.waitForCompletion(true)) {
        return 1;
    }
    Path outputDir = new Path(destDir);
    FileSystem srcFs = outputDir.getFileSystem(getConf());
    FileSystem destFs = outputDir.getFileSystem(getConf());
    // Valid records
    Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR);
    // The input file and the quality-filtered file should have the same name (but a different path)
    Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName());
    if (srcFs.exists(goodFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, "");
    }
    // Invalid records
    Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR);
    Path badDestFile = new Path(destDir + "/" + DQConstants.BAD_RECORDS_FILE);
    if (srcFs.exists(badFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, "");
    }
    // Prepare the report aggregation job
    Job fileReportAggregationJob = Job.getInstance(conf);
    fileReportAggregationJob.setJobName("File Report Computing " + processId);
    fileReportAggregationJob.setJarByClass(DQMain.class);
    fileReportAggregationJob.setMapperClass(DQFileReportMapper.class);
    fileReportAggregationJob.setMapOutputKeyClass(Text.class);
    fileReportAggregationJob.setMapOutputValueClass(IntWritable.class);
    fileReportAggregationJob.setReducerClass(DQFileReportReducer.class);
    fileReportAggregationJob.setOutputKeyClass(Text.class);
    fileReportAggregationJob.setOutputValueClass(Text.class);
    fileReportAggregationJob.setNumReduceTasks(1);
    Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR);
    Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir);
    FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir);
    if (!fileReportAggregationJob.waitForCompletion(true)) {
        return 1;
    }
    // Merge the report records
    Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE);
    FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, "");
    Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE);
    // Read the report file from HDFS and report the percentage
    DQStats dqStats = getQualityStats(getConf(), reportDestFile);
    LOGGER.info("Percentage of good records :" + dqStats.getGoodPercent());
    props = new GetProperties().getProperties(processId, "dq");
    String strThreshold = props.getProperty("min.pass.threshold.percent");
    float threshold = Float.parseFloat(strThreshold);
    dqStats.setThreshold(threshold);
    // Update the result in metadata
    logResult(dqStats, processId, 0L);
    if (dqStats.getGoodPercent() < threshold) {
        LOGGER.error("DQ check did not pass");
        throw new DQValidationException(dqStats);
    }
    LOGGER.info(dqStats);
    FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile);
    String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString();
    // Return file info via oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash(fileHash);
    registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen());
    registerFileInfo.setPath(goodDestFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}
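The DQMapper referenced by this driver is not shown on this page. A plausible sketch of the mapper side follows: it assumes a passesQualityChecks helper (purely hypothetical) and uses the four-argument write overload so that good and bad records land in the intermediate directories the driver merges afterwards. The DQConstants names come from the driver above; everything else is assumption.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

// Hypothetical sketch, not the actual com.wipro.ats.bdre.dq.DQMapper.
public class DQMapperSketch extends Mapper<LongWritable, Text, Text, NullWritable> {

    private MultipleOutputs<Text, NullWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, NullWritable>(context);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        if (passesQualityChecks(value)) {
            // The base output path is relative to the job output dir, which is why the driver
            // can later copyMerge from destDir + "/" + INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR.
            mos.write(DQConstants.GOOD_RECORDS_FILE, value, NullWritable.get(),
                    DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR + "/" + DQConstants.GOOD_RECORDS_FILE);
        } else {
            mos.write(DQConstants.BAD_RECORDS_FILE, value, NullWritable.get(),
                    DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR + "/" + DQConstants.BAD_RECORDS_FILE);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }

    private boolean passesQualityChecks(Text record) {
        return true; // placeholder; the real checks are driven by the "dq" properties
    }
}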
From source file:com.yahoo.labs.yamall.hadoop.Test.java
License:Open Source License
/**
 * Run the map/reduce job
 */
public final int run(final String[] args) throws Exception {
    startLogger(Level.INFO);
    Configuration conf = getConf();
    conf.set("yamall.vw_model", args[2]);
    conf.setIfUnset("yamall.bit_precision", "18");
    conf.setIfUnset("yamall.parser", "vw");
    // Print all the options to the screen
    TreeMap<String, String> map = new TreeMap<String, String>();
    for (Map.Entry<String, String> entry : conf) {
        map.put(entry.getKey(), entry.getValue());
    }
    for (Map.Entry<String, String> entry : map.entrySet()) {
        System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
    }
    Job job = Job.getInstance(conf, "Yamall Test on MapReduce");
    job.setNumReduceTasks(1);
    job.setJarByClass(Test.class);
    job.setMapperClass(TestMapper.class);
    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setReducerClass(TestReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(CompositeDoubleTextWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    MultipleOutputs.addNamedOutput(job, "out", TextOutputFormat.class, NullWritable.class, Text.class);
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
/**
 * pass2: perform the LLR calculation
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(AssocReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(AssocReducer.MIN_VALUE, minValue);
    conf.setInt("mapred.job.map.memory.mb", 1280);
    conf.setInt("mapred.job.reduce.memory.mb", 2560);
    conf.set("mapred.reduce.child.java.opts", "-Xmx2G");
    conf.setInt("mapred.task.timeout", 6000000);
    conf.set(AssocReducer.ASSOC_METRIC, "llr");
    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output + " pruning: " + minValue);
    job.setJarByClass(CollocDriver.class);
    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY + "_llr");
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.setReducerClass(AssocReducer.class);
    job.setNumReduceTasks(reduceTasks);
    // Defines an additional text-based output 'contingency' for the job
    MultipleOutputs.addNamedOutput(job, "contingency", TextOutputFormat.class, Text.class, Text.class);
    // Defines additional text-based outputs, one per association metric
    MultipleOutputs.addNamedOutput(job, "llr", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "pmi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "chi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "dice", TextOutputFormat.class, Text.class, DoubleWritable.class);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
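The four DoubleWritable outputs registered above (llr, pmi, chi, dice) suggest that AssocReducer fans each n-gram's scores out to one named output per association metric. That reducer is not shown here; the following is a minimal sketch of the fan-out pattern only, with a hypothetical class name and placeholder metric values.

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class MetricFanoutReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    private MultipleOutputs<Text, DoubleWritable> mos;

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, DoubleWritable>(context);
    }

    @Override
    protected void reduce(Text ngram, Iterable<DoubleWritable> counts, Context context)
            throws IOException, InterruptedException {
        double total = 0;
        for (DoubleWritable c : counts) {
            total += c.get();
        }
        // Placeholder scores: the real reducer would compute each metric from the counts.
        mos.write("llr", ngram, new DoubleWritable(total));  // log-likelihood ratio
        mos.write("pmi", ngram, new DoubleWritable(total));  // pointwise mutual information
        mos.write("chi", ngram, new DoubleWritable(total));  // chi-squared
        mos.write("dice", ngram, new DoubleWritable(total)); // Dice coefficient
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }
}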
From source file:edu.umn.cs.sthadoop.hdfs.KNNJoin.java
License:Open Source License
static void knnJoinMapReduce(OperationsParams params)
        throws IOException, InterruptedException, ClassNotFoundException {
    final Path[] inputPaths = params.getInputPaths();
    Path outputPath = params.getOutputPath();
    // final int k = params.getInt("k", 1);
    KNNJRecordReader.params = params;
    // System.out.println(params.getInputPaths().length);
    long t1 = System.currentTimeMillis();
    // phase 1
    params.set("type", "phase1");
    Job job = Job.getInstance(params, "KNNJoin Phase1");
    job.setJarByClass(KNNJoin.class);
    job.setInputFormatClass(KNNJInputFormat.class);
    KNNJInputFormat.setInputPaths(job, inputPaths[0], inputPaths[1]);
    job.setMapperClass(KNNJMap.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(TextOutputFormat3.class);
    TextOutputFormat3.setOutputPath(job, outputPath);
    MultipleOutputs.addNamedOutput(job, "phase2", TextOutputFormat3.class, Text.class, Text.class);
    // Submit the job
    if (job.waitForCompletion(true)) {
        LOG.info("[stat:job[0]");
    } else {
        LOG.info("[stat:job[1]");
        return;
    }
    long t2 = System.currentTimeMillis() - t1;
    t1 = System.currentTimeMillis();
    Counters counters = job.getCounters();
    long refSplits = counters.findCounter(KNNJMap.Stats.refSplits).getValue();
    long qSplits = counters.findCounter(KNNJMap.Stats.qSplits).getValue();
    long numRefRecs = counters.findCounter(KNNJMap.Stats.numRefRecs).getValue();
    long numQRecs = counters.findCounter(KNNJMap.Stats.numQRecs).getValue();
    long numP2Recs = counters.findCounter(KNNJMap.Stats.phase2Recs).getValue();
    String str = String.format(
            "stat:counters[refSplits=%s;qSplits=%s;numRefRecs=%s;numQRecs=%s;numP2Recs=%s;t1=%s]",
            refSplits, qSplits, numRefRecs, numQRecs, numP2Recs, t2);
    LOG.info(str);
    // LOG.info("[stat:counter:refSplits=" + refSplits + "]");
    // LOG.info("[stat:counter:qSplits=" + qSplits + "]");
    // LOG.info("[stat:counter:numRefRecs=" + numRefRecs + "]");
    // LOG.info("[stat:counter:numQRecs=" + numQRecs + "]");
    // LOG.info("[stat:counter:numP2Recs=" + numP2Recs + "]");
    /*
    for (Iterator<String> iterator = counters.getGroupNames().iterator(); iterator.hasNext();) {
        String str = (String) iterator.next();
        LOG.info("[stat:counter=" + str + "]");
    }
    */
    // end of phase 1

    // Phases 2 and 3 are commented out in the original source:
    /*
    // phase 2
    params.set("type", "phase2");
    Job job2 = Job.getInstance(params, "KNNJoin Phase2");
    job2.setJarByClass(KNNJoin.class);
    job2.setMapperClass(TokenizerMapper.class);
    job2.setReducerClass(GroupingReducer.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    FileSystem outputFS = outputPath.getFileSystem(params);
    Path p2OutPath;
    do {
        p2OutPath = new Path(outputPath.getParent(),
                outputPath.getName() + ".knnj_" + (int) (Math.random() * 1000000));
    } while (outputFS.exists(p2OutPath));
    FileSystem p2OutPathFS = FileSystem.get(p2OutPath.toUri(), params);
    job2.setInputFormatClass(KNNJInputFormatPhase2.class);
    KNNJInputFormatPhase2.setInputPaths(job2, outputPath);
    job2.setOutputFormatClass(TextOutputFormat3.class);
    TextOutputFormat3.setOutputPath(job2, p2OutPath);
    MultipleOutputs.addNamedOutput(job2, "phase3", TextOutputFormat3.class, NullWritable.class, Text.class);
    // Submit the job
    if (job2.waitForCompletion(true)) {
        LOG.info("Job2 succeeded.");
    } else {
        LOG.info("Job2 failed.");
        return;
    }
    // end of phase 2
    t2 = System.currentTimeMillis() - t1;
    LOG.info("[stat:time:2=" + t2 + "]");
    t1 = System.currentTimeMillis();
    // phase 3
    params.set("type", "phase3");
    Job job3 = Job.getInstance(params, "KNNJoin Phase3");
    job3.setJarByClass(KNNJoin.class);
    job3.setMapperClass(KNNJMapPhase3.class);
    job3.setOutputKeyClass(NullWritable.class);
    job3.setOutputValueClass(Text.class);
    job3.setNumReduceTasks(0);
    Path p3OutPath;
    do {
        p3OutPath = new Path(outputPath.getParent(),
                outputPath.getName() + ".knnj_" + (int) (Math.random() * 1000000));
    } while (outputFS.exists(p3OutPath));
    FileSystem p3OutPathFS = FileSystem.get(p3OutPath.toUri(), params);
    job3.setInputFormatClass(KNNJInputFormatPhase3.class);
    KNNJInputFormatPhase3.setInputPaths(job3, p2OutPath, inputPaths[1]);
    job3.setOutputFormatClass(TextOutputFormat3.class);
    TextOutputFormat3.setOutputPath(job3, p3OutPath);
    // Submit the job
    if (job3.waitForCompletion(true)) {
        LOG.info("Job3 succeeded.");
    } else {
        LOG.info("Job3 failed.");
        return;
    }
    // end of phase 3
    // clean up temporary dirs and files
    p2OutPathFS.delete(p2OutPath, true);
    p3OutPathFS.delete(p3OutPath, true);
    t2 = System.currentTimeMillis() - t1;
    LOG.info("[stat:time:3=" + t2 + "]");
    */
}
From source file:edu.umn.cs.sthadoop.operations.STJoins.java
License:Open Source License
static void JoinMapReduce(OperationsParams params)
        throws IOException, InterruptedException, ClassNotFoundException {
    final Path[] inputPaths = params.getInputPaths();
    Path outputPath = params.getOutputPath();
    // final int k = params.getInt("k", 1);
    HdfsRecordReader.params = params;
    // System.out.println(params.getInputPaths().length);
    long t1 = System.currentTimeMillis();
    // phase 1
    params.set("type", "phase1");
    Job job = Job.getInstance(params, "ST-Join Phase1");
    job.setJarByClass(STJoinsMapper.class);
    job.setInputFormatClass(HdfsInputFormat.class);
    HdfsInputFormat.setInputPaths(job, inputPaths[0], inputPaths[1]);
    job.setMapperClass(STJoinsMapper.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(TextOutputFormat3.class);
    TextOutputFormat3.setOutputPath(job, outputPath);
    MultipleOutputs.addNamedOutput(job, "phase2", TextOutputFormat3.class, Text.class, Text.class);
    // Submit the job
    if (job.waitForCompletion(true)) {
        LOG.info("[stat:job[0]");
    } else {
        LOG.info("[stat:job[1]");
        return;
    }
    long t2 = System.currentTimeMillis() - t1;
    t1 = System.currentTimeMillis();
    Counters counters = job.getCounters();
    long refSplits = counters.findCounter(STJoinsMapper.Stats.refSplits).getValue();
    long qSplits = counters.findCounter(STJoinsMapper.Stats.qSplits).getValue();
    long numRefRecs = counters.findCounter(STJoinsMapper.Stats.numRefRecs).getValue();
    long numQRecs = counters.findCounter(STJoinsMapper.Stats.numQRecs).getValue();
    long numP2Recs = counters.findCounter(STJoinsMapper.Stats.phase2Recs).getValue();
    String str = String.format(
            "stat:counters[refSplits=%s;qSplits=%s;numRefRecs=%s;numQRecs=%s;numP2Recs=%s;t1=%s]",
            refSplits, qSplits, numRefRecs, numQRecs, numP2Recs, t2);
    LOG.info(str);
    // LOG.info("[stat:counter:refSplits=" + refSplits + "]");
    // LOG.info("[stat:counter:qSplits=" + qSplits + "]");
    // LOG.info("[stat:counter:numRefRecs=" + numRefRecs + "]");
    // LOG.info("[stat:counter:numQRecs=" + numQRecs + "]");
    // LOG.info("[stat:counter:numP2Recs=" + numP2Recs + "]");
    /*
    for (Iterator<String> iterator = counters.getGroupNames().iterator(); iterator.hasNext();) {
        String str = (String) iterator.next();
        LOG.info("[stat:counter=" + str + "]");
    }
    */
    // end of phase 1

    // Phases 2 and 3 are commented out in the original source:
    /*
    // phase 2
    params.set("type", "phase2");
    Job job2 = Job.getInstance(params, "KNNJoin Phase2");
    job2.setJarByClass(KNNJoin.class);
    job2.setMapperClass(TokenizerMapper.class);
    job2.setReducerClass(GroupingReducer.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    FileSystem outputFS = outputPath.getFileSystem(params);
    Path p2OutPath;
    do {
        p2OutPath = new Path(outputPath.getParent(),
                outputPath.getName() + ".knnj_" + (int) (Math.random() * 1000000));
    } while (outputFS.exists(p2OutPath));
    FileSystem p2OutPathFS = FileSystem.get(p2OutPath.toUri(), params);
    job2.setInputFormatClass(KNNJInputFormatPhase2.class);
    KNNJInputFormatPhase2.setInputPaths(job2, outputPath);
    job2.setOutputFormatClass(TextOutputFormat3.class);
    TextOutputFormat3.setOutputPath(job2, p2OutPath);
    MultipleOutputs.addNamedOutput(job2, "phase3", TextOutputFormat3.class, NullWritable.class, Text.class);
    // Submit the job
    if (job2.waitForCompletion(true)) {
        LOG.info("Job2 succeeded.");
    } else {
        LOG.info("Job2 failed.");
        return;
    }
    // end of phase 2
    t2 = System.currentTimeMillis() - t1;
    LOG.info("[stat:time:2=" + t2 + "]");
    t1 = System.currentTimeMillis();
    // phase 3
    params.set("type", "phase3");
    Job job3 = Job.getInstance(params, "KNNJoin Phase3");
    job3.setJarByClass(KNNJoin.class);
    job3.setMapperClass(STJoinsMapperPhase3.class);
    job3.setOutputKeyClass(NullWritable.class);
    job3.setOutputValueClass(Text.class);
    job3.setNumReduceTasks(0);
    Path p3OutPath;
    do {
        p3OutPath = new Path(outputPath.getParent(),
                outputPath.getName() + ".knnj_" + (int) (Math.random() * 1000000));
    } while (outputFS.exists(p3OutPath));
    FileSystem p3OutPathFS = FileSystem.get(p3OutPath.toUri(), params);
    job3.setInputFormatClass(KNNJInputFormatPhase3.class);
    KNNJInputFormatPhase3.setInputPaths(job3, p2OutPath, inputPaths[1]);
    job3.setOutputFormatClass(TextOutputFormat3.class);
    TextOutputFormat3.setOutputPath(job3, p3OutPath);
    // Submit the job
    if (job3.waitForCompletion(true)) {
        LOG.info("Job3 succeeded.");
    } else {
        LOG.info("Job3 failed.");
        return;
    }
    // end of phase 3
    // clean up temporary dirs and files
    p2OutPathFS.delete(p2OutPath, true);
    p3OutPathFS.delete(p3OutPath, true);
    t2 = System.currentTimeMillis() - t1;
    LOG.info("[stat:time:3=" + t2 + "]");
    */
}
From source file:eu.edisonproject.classification.tfidf.mapreduce.CompetencesDistanceDriver.java
License:Apache License
@Override
public int run(String[] args) {
    try {
        Configuration conf = HBaseConfiguration.create();
        // additional output using TextOutputFormat.
        conf.set("file.names", args[3]);
        Job job = Job.getInstance(conf);
        // TableMapReduceUtil.addDependencyJars(job);
        job.setJarByClass(CompetencesDistanceDriver.class);
        // This row must be changed
        job.setJobName("Words Group By Title Driver");
        Path inPath = new Path(args[0]);
        Path outPath = new Path(args[1]);
        Path competencesPath = new Path(args[2]);
        Path competencesPathHDFS = competencesPath;
        FileSystem fs = FileSystem.get(conf);
        if (!conf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
            competencesPathHDFS = new Path(competencesPath.getName());
            if (!fs.exists(competencesPathHDFS)) {
                fs.mkdirs(competencesPathHDFS);
                File[] stats = new File(competencesPath.toString()).listFiles();
                for (File stat : stats) {
                    Path filePath = new Path(stat.getAbsolutePath());
                    if (FilenameUtils.getExtension(filePath.getName()).endsWith("csv")) {
                        Path dest = new Path(competencesPathHDFS.toUri() + "/" + filePath.getName());
                        fs.copyFromLocalFile(filePath, dest);
                    }
                }
            }
        }
        job.addCacheFile(competencesPathHDFS.toUri());
        FileInputFormat.setInputPaths(job, inPath);
        FileOutputFormat.setOutputPath(job, outPath);
        fs.delete(outPath, true);
        job.setMapperClass(CompetencesDistanceMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(CompetencesDistanceReducer.class);
        // job.setOutputFormatClass(TableOutputFormat.class);
        // job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "jobpostcompetence");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        String[] fileNames = args[3].split(",");
        for (String n : fileNames) {
            MultipleOutputs.addNamedOutput(job, n, TextOutputFormat.class, Text.class, Text.class);
        }
        return (job.waitForCompletion(true) ? 0 : 1);
    } catch (IOException | IllegalStateException | IllegalArgumentException
            | InterruptedException | ClassNotFoundException ex) {
        Logger.getLogger(CompetencesDistanceDriver.class.getName()).log(Level.SEVERE, null, ex);
    }
    return 0;
}
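A caveat with registering named outputs from user input as done here: addNamedOutput validates each name and accepts only letters and digits, so the comma-separated names in args[3] must already be alphanumeric or the loop above throws an IllegalArgumentException (caught and logged by the surrounding try block).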
From source file:eu.scape_project.archiventory.Archiventory.java
License:Apache License
public static void startHadoopJob(Configuration conf) {
    try {
        Job job = new Job(conf, "archiventory");
        // local debugging (pseudo-distributed)
        // job.getConfiguration().set("mapred.job.tracker", "local");
        // job.getConfiguration().set("fs.default.name", "file:///");
        job.setJarByClass(Archiventory.class);
        job.setMapperClass(Archiventory.ContainerItemIdentificationMapper.class);
        job.setReducerClass(Archiventory.ContainerItemIdentificationReducer.class);
        job.setInputFormatClass(TextInputFormat.class);
        // tabular output of identification results
        MultipleOutputs.addNamedOutput(job, "idtab", TextOutputFormat.class, Text.class, Text.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ObjectWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ObjectWritable.class);
        TextInputFormat.addInputPath(job, new Path(config.getDirStr()));
        String outpath = "output/" + System.currentTimeMillis();
        FileOutputFormat.setOutputPath(job, new Path(outpath));
        job.waitForCompletion(true);
        System.out.print(outpath);
        System.exit(0);
    } catch (Exception e) {
        logger.error("I/O error", e);
    }
}
From source file:eu.scape_project.spacip.Spacip.java
License:Apache License
/**
 * Start Hadoop job
 *
 * @param conf Hadoop job configuration
 */
public static void startHadoopJob(Configuration conf) {
    try {
        Job job = new Job(conf, "spacip_" + conf.getInt("num_items_per_task", 0));
        // local debugging (pseudo-distributed)
        // job.getConfiguration().set("mapred.job.tracker", "local");
        // job.getConfiguration().set("fs.default.name", "file:///");
        job.setJarByClass(Spacip.class);
        job.setMapperClass(Spacip.ContainerProcessingMapper.class);
        // No reducer needed
        job.setNumReduceTasks(0);
        job.setInputFormatClass(TextInputFormat.class);
        MultipleOutputs.addNamedOutput(job, "keyfilmapping", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "tomarinput", TextOutputFormat.class, Text.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "error", TextOutputFormat.class, Text.class, Text.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ObjectWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ObjectWritable.class);
        TextInputFormat.addInputPath(job, new Path(config.getDirStr()));
        String outpath = StringUtils.normdir(conf.get("joboutput_hdfs_path", "spacip_joboutput"))
                + System.currentTimeMillis();
        FileOutputFormat.setOutputPath(job, new Path(outpath));
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
        job.waitForCompletion(true);
        // print output path (taverna integration)
        System.out.print(outpath);
        System.exit(0);
    } catch (Exception e) {
        logger.error("I/O error", e);
    }
}
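A note on the LazyOutputFormat.setOutputFormatClass call above: since every record this mapper emits goes through one of the three named outputs, wrapping the default output format lazily keeps Hadoop from creating empty part files for the otherwise unused default output.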