List of usage examples for org.apache.hadoop.mapred.lib.MultipleInputs#addInputPath
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass, Class<? extends Mapper> mapperClass)
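For orientation, here is a minimal driver sketch showing typical usage of this method with the old mapred API. LogsMapper, CsvMapper, and JoinReducer are hypothetical classes invented for illustration; everything else is the standard Hadoop 1.x API.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(MultipleInputsSketch.class);
        job.setJobName("multipleinputs-sketch");

        // Each input path is paired with its own InputFormat and Mapper;
        // MultipleInputs records the mapping in the JobConf and installs a
        // DelegatingInputFormat/DelegatingMapper behind the scenes.
        // LogsMapper and CsvMapper are hypothetical mapper classes.
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, LogsMapper.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, CsvMapper.class);

        // A single reducer sees the merged map output from both sources.
        job.setReducerClass(JoinReducer.class); // hypothetical reducer
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        JobClient.runJob(job);
    }
}

The examples below show the same pattern in real projects.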
From source file:com.datasalt.pangool.benchmark.urlresolution.HadoopUrlResolution.java
License:Apache License
public final static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: urlresolution <url-map> <url-register> <out>");
        System.exit(2);
    }
    JobConf job = new JobConf(conf);

    FileSystem fS = FileSystem.get(conf);
    fS.delete(new Path(otherArgs[2]), true);

    MultipleInputs.addInputPath(job, new Path(otherArgs[0]), TextInputFormat.class, UrlMapClass.class);
    MultipleInputs.addInputPath(job, new Path(otherArgs[1]), TextInputFormat.class, UrlRegisterMapClass.class);

    job.setJarByClass(HadoopUrlResolution.class);
    job.setPartitionerClass(KeyPartitioner.class);
    job.setOutputValueGroupingComparator(GroupingComparator.class);

    job.setMapOutputKeyClass(UrlRegJoinUrlMap.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    Job j = new Job(job);
    j.setReducerClass(Reduce.class);
    j.waitForCompletion(true);
}
From source file:com.ebay.erl.mobius.core.mapred.MobiusMultiInputs.java
License:Apache License
public static void addInputPath(JobConf conf, Path anInput, Class<? extends InputFormat> inputFormatClass,
        Class<? extends AbstractMobiusMapper> mapperClass, byte datasetID, FileSystem fs) throws IOException {
    MultipleInputs.addInputPath(conf, anInput, inputFormatClass, mapperClass);

    // Override the {@link InputFormat} class set by {@link MultipleInputs},
    // as Mobius needs to set the current dataset ID per input split.
    conf.setInputFormat(MobiusDelegatingInputFormat.class);
    // MobiusDelegatingInputFormat extends DelegatingInputFormat, which always
    // calls FileInputFormat#setInputPaths within DelegatingInputFormat#getSplits,
    // regardless of the actual type of <code>inputFormatClass</code>.

    /////////////////////////////////////////////////////
    // start to build the path to dataset ID mapping
    /////////////////////////////////////////////////////
    MultiInputsHelper helper = MultiInputsHelpersRepository.getInstance(conf).getHelper(inputFormatClass);
    URI uri = helper.getUniquePathByInputFormat(conf, anInput);
    String aPath = uri.toString();
    if (aPath.indexOf(";") >= 0)
        throw new IllegalArgumentException(aPath + " cannot contain a semicolon");

    // set the input path to dataset ID mapping in the Hadoop configuration.
    if (conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING, "").isEmpty()) {
        conf.set(ConfigureConstants.INPUT_TO_DATASET_MAPPING, datasetID + ";" + aPath);
    } else {
        String previous = conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING);
        conf.set(ConfigureConstants.INPUT_TO_DATASET_MAPPING, datasetID + ";" + aPath + "," + previous);
    }
    //LOGGER.debug(conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING, ""));
}
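For context, a hedged sketch of how a caller might invoke this Mobius helper, matching the signature above. MyDatasetMapper and AnotherMapper are hypothetical subclasses of AbstractMobiusMapper, and the paths and dataset IDs are illustrative:

// Hypothetical usage; MyDatasetMapper and AnotherMapper extend AbstractMobiusMapper.
JobConf conf = new JobConf();
FileSystem fs = FileSystem.get(conf);
MobiusMultiInputs.addInputPath(conf, new Path("/data/dataset-a"),
        TextInputFormat.class, MyDatasetMapper.class, (byte) 0, fs);
MobiusMultiInputs.addInputPath(conf, new Path("/data/dataset-b"),
        TextInputFormat.class, AnotherMapper.class, (byte) 1, fs);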
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (!createJobConfAndParseArgs(args)) {
        return 0;
    }

    setFileSystem(FileSystem.get(job));

    FileStatus status = fs.getFileStatus(srcDir);
    if (null == status || !status.isDir()) {
        throw new IllegalArgumentException("No such directory: " + srcDir);
    }

    if (Mode.STAND_ALONE == mode) {
        standAlone();
    } else {
        writeDirs();

        MultipleInputs.addInputPath(job, bucketFiles, SequenceFileInputFormat.class, IdentityMapper.class);
        MultipleInputs.addInputPath(job, counters, CountersInputFormat.class, CountersMapper.class);

        job.setPartitionerClass(CrushPartitioner.class);
        job.setReducerClass(CrushReducer.class);
        job.setOutputKeyComparatorClass(Text.Comparator.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, bucketFiles);
        FileOutputFormat.setOutputPath(job, outDir);

        job.set("crush.partition.map", partitionMap.toString());

        if (0 != nBuckets) {
            print(Verbosity.INFO, "\n\nInvoking map reduce\n\n");
            RunningJob completed = JobClient.runJob(job);
            jobCounters = completed.getCounters();
        }

        long eligible = jobCounters.getCounter(MapperCounter.FILES_ELIGIBLE);
        long crushed = jobCounters.getCounter(ReducerCounter.FILES_CRUSHED);

        /*
         * This must hold true if Hadoop is working correctly.
         */
        if (eligible != crushed) {
            throw new AssertionError(format("Files eligible (%d) != files crushed (%d)", eligible, crushed));
        }

        if (Mode.CLONE == mode) {
            cloneOutput();
        } else {
            moveOutput();
        }
    }

    print(Verbosity.INFO, "\n\nDeleting temporary directory");
    fs.delete(tmpDir, true);

    /*
     * If we have printed anything to the console at all, then add a line wrap to bring the cursor back to the beginning.
     */
    print(Verbosity.INFO, "\n\n");

    return 0;
}
From source file:findstableweatherstate.FindStableWeatherState.java
public String call() throws Exception {
    Path firstOutputPath = new Path("input/firstOutput");
    Path secondOutputPath = new Path("input/secondOutput");
    long startTime, stopTime, elapsedTime;

    JobConf job = new JobConf();
    job.setJarByClass(getClass());
    job.setJobName("invertedindex");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(JoinReducer.class);

    MultipleInputs.addInputPath(job, new Path(getInputPathStation()), TextInputFormat.class,
            StationMapper.class);
    MultipleInputs.addInputPath(job, new Path(getInputPathReadings()), TextInputFormat.class,
            ReadingsMapper.class);

    FileOutputFormat.setOutputPath(job, firstOutputPath);

    JobConf job2 = new JobConf();
    job2.setJarByClass(getClass());
    job2.setJobName("secondJob");

    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);

    //job2.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    FileInputFormat.setInputPaths(job2, firstOutputPath);

    job2.setMapperClass(CalculateMinMaxTemperatureMapper.class);
    job2.setReducerClass(CalculateMaxMinTemperatureReducer.class);

    if (getOutputPath() != null) {
        FileOutputFormat.setOutputPath(job2, secondOutputPath);
    }

    JobConf job3 = new JobConf();
    job3.setJarByClass(getClass());
    job3.setJobName("thirdJob");

    job3.setOutputKeyClass(Text.class);
    job3.setOutputValueClass(Text.class);
    job3.setMapOutputKeyClass(DoubleWritable.class);
    job3.setMapOutputValueClass(Text.class);

    //job2.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    FileInputFormat.setInputPaths(job3, secondOutputPath);

    job3.setMapperClass(SortStateMapper.class);
    job3.setReducerClass(SortStateReducer.class);

    if (getOutputPath() != null) {
        FileOutputFormat.setOutputPath(job3, new Path(getOutputPath()));
    }

    startTime = System.currentTimeMillis();
    JobClient.runJob(job);
    stopTime = System.currentTimeMillis();
    elapsedTime = stopTime - startTime;
    System.out.println("******************** First Job : " + elapsedTime / 1000);

    startTime = System.currentTimeMillis();
    JobClient.runJob(job2);
    stopTime = System.currentTimeMillis();
    elapsedTime = stopTime - startTime;
    System.out.println("******************** Second Job : " + elapsedTime / 1000);

    startTime = System.currentTimeMillis();
    JobClient.runJob(job3);
    stopTime = System.currentTimeMillis();
    elapsedTime = stopTime - startTime;
    System.out.println("******************** Third Job : " + elapsedTime / 1000);

    return "";
}
From source file:fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram.java
License:Apache License
/**
 * Creates a JobConf for a Job that will merge the unique listeners and track statistics.
 *
 * @param outputPath The path for the results to be output to.
 * @param sumInputDir The path containing the data from the sum Job.
 * @param listenersInputDir The path containing the data from the unique listeners job.
 * @return The merge JobConf.
 */
private JobConf getMergeConf(Path outputPath, Path sumInputDir, Path listenersInputDir) {
    log.info("Creating configuration for merge job");
    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(TrackStats.class); // overall track statistics
    conf.setCombinerClass(SumReducer.class); // safe to re-use reducer as a combiner here
    conf.setReducerClass(SumReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outputPath);
    MultipleInputs.addInputPath(conf, sumInputDir, SequenceFileInputFormat.class, IdentityMapper.class);
    MultipleInputs.addInputPath(conf, listenersInputDir, SequenceFileInputFormat.class,
            MergeListenersMapper.class);
    conf.setJobName("merge");
    return conf;
}
From source file:hibench.DataGenerator.java
License:Apache License
public void replaceIds(Path fcontent, Path fids, Path fjoin, ZipfRandom zipf) throws IOException {
    LOG.info("Replace Virtual Zipfian Ids with real Ids...");

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = fcontent.getName() + " JOIN " + fids.getName() + " -> " + fjoin.getName();
    job.setJobName(jobname);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    MultipleInputs.addInputPath(job, fids, TextInputFormat.class, TagRecordsMapper.class);
    MultipleInputs.addInputPath(job, fcontent, TextInputFormat.class, ReverseContentMapper.class);

    job.setOutputFormat(TextOutputFormat.class);

    // use combiner to avoid too many inputs for reducer
    job.setCombinerClass(ConcatTextCombiner.class);
    job.setReducerClass(JoinContentWithZipfReducer.class);

    if (zipf.reds > 0) {
        job.setNumReduceTasks(zipf.reds);
    } else {
        job.setNumReduceTasks(DataOptions.getMaxNumReduce());
    }

    FileOutputFormat.setOutputPath(job, fjoin);

    LOG.info("Running Job: " + jobname);
    LOG.info("Zipfian Id distribution: " + fids);
    LOG.info("Content file with virtual Ids: " + fcontent);
    LOG.info("Joint result file: " + fjoin);
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);
}
From source file:hibench.HiveDataGenerator.java
License:Apache License
private void createRankingsTable() throws IOException {
    LOG.info("Creating table rankings...");

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " rankings";
    job.setJobName(jobname);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);

    job.setCombinerClass(ConcatTextCombiner.class);
    job.setReducerClass(CountRankingAndReplaceIdReducer.class);

    if (options.reds > 0) {
        job.setNumReduceTasks(options.reds);
    } else {
        job.setNumReduceTasks(DataOptions.getMaxNumReduce());
    }
    // job.setNumReduceTasks(options.agents/2);

    /***
     * We need to join the result with the LINK table
     * to replace url ids with real contents.
     */
    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.T_LINK_PAGE), TextInputFormat.class,
            MyIdentityMapper.class);
    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.LINKS), TextInputFormat.class,
            TagRecordsMapper.class);

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.RANKINGS));

    LOG.info("Running Job: " + jobname);
    LOG.info("Table link-page file " + paths.getPath(DataPaths.T_LINK_PAGE) + " as input");
    LOG.info("Links file " + paths.getResult(DataPaths.LINKS) + " as input");
    LOG.info("Output file " + paths.getResult(DataPaths.RANKINGS));
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.RANKINGS));
}
From source file:hibench.HiveDataGenerator.java
License:Apache License
private void createUserVisitsTable() throws IOException, URISyntaxException {
    LOG.info("Creating user visits...");

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " uservisits";
    job.setJobName(jobname);

    /***
     * Set distributed cache files for table generation.
     * Cache files include:
     * 1. user agents
     * 2. country code and language code
     * 3. search keys
     */
    DistributedCache.addCacheFile(paths.getPath(DataPaths.uagentf).toUri(), job);
    DistributedCache.addCacheFile(paths.getPath(DataPaths.countryf).toUri(), job);
    DistributedCache.addCacheFile(paths.getPath(DataPaths.searchkeyf).toUri(), job);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);

    visit.setJobConf(job);

    job.setInputFormat(TextInputFormat.class);

    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.DUMMY), NLineInputFormat.class,
            CreateRandomAccessMapper.class);
    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.LINKS), TextInputFormat.class,
            TagRecordsMapper.class);

    job.setCombinerClass(CreateUserVisitsCombiner.class);
    job.setReducerClass(CreateUserVisitsReducer.class);

    if (options.reds > 0) {
        job.setNumReduceTasks(options.reds);
    } else {
        job.setNumReduceTasks(DataOptions.getMaxNumReduce());
    }
    // job.setNumReduceTasks(options.agents/2);

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.USERVISITS));

    LOG.info("Running Job: " + jobname);
    LOG.info("Dummy file " + paths.getPath(DataPaths.DUMMY) + " as input");
    LOG.info("Links file " + paths.getResult(DataPaths.LINKS) + " as input");
    LOG.info("Output file " + paths.getResult(DataPaths.USERVISITS));
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.USERVISITS));
}
From source file:net.team1.dev.HousingAnalysis.java
License:Apache License
/**
 * The main entry point for the map/reduce runner.
 *
 * @param args 2 args: \<input dir\> \<output dir\>
 * @throws Exception Throws IOException
 */
public static void main(String[] args) throws Exception {
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    FileSystem fs = FileSystem.get(new Configuration());
    if (!fs.exists(inputDir))
        throw new IOException("The input path does not exist.");
    if (fs.isFile(inputDir))
        throw new IOException("The input path is a file.");
    if (fs.exists(outputDir))
        fs.delete(outputDir, true);

    // set job configuration
    JobConf conf = new JobConf(HousingAnalysis.class);
    conf.setJobName("housinganalysis");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setCombinerClass(HousingReducer.class);
    conf.setReducerClass(HousingReducer.class);

    // set multiple input files
    HashMap<Path, Class<? extends Mapper>> inputMappers = getInputFilePaths(inputDir, fs);
    for (Path p : inputMappers.keySet()) {
        MultipleInputs.addInputPath(conf, p, TextInputFormat.class, inputMappers.get(p));
        LOG.info(p.getName() + ": " + inputMappers.get(p).getName());
    }

    // set output
    FileOutputFormat.setOutputPath(conf, outputDir);

    // start the job
    JobClient.runJob(conf);
}
From source file:org.acacia.csr.java.CSRConverter.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (!validArgs(args)) {
        printUsage();
        return;
    }

    // These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/csrconverter-output";
    String dir2 = "/user/miyuru/csrconverter-output-sorted";

    // We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    System.out.println("Deleting the dir : " + dir1);
    if (fs1.exists(new Path(dir1))) {
        fs1.delete(new Path(dir1), true);
    }
    System.out.println("Done deleting the dir : " + dir1);

    System.out.println("Deleting the dir : " + dir2);
    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    Path notinPath = new Path("/user/miyuru/notinverts/notinverts");
    if (!fs1.exists(notinPath)) {
        fs1.create(notinPath);
    }
    System.out.println("Done deleting the dir : " + dir2);

    // Note on Aug 23 2014: Sometimes after this the MapReduce job hangs; need to see why.
    VertexCounterClient.setDefaultGraphID(args[3], args[2]);

    // First job creates the inverted index
    JobConf conf = new JobConf(CSRConverter.class);
    conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[1]);
    conf.set("org.acacia.partitioner.hbase.table", args[2]);
    conf.set("org.acacia.partitioner.hbase.contacthost", args[3]);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    //conf.setMapperClass(InvertedMapper.class);
    conf.setReducerClass(InvertedReducer.class);
    //conf.setInputFormat(TextInputFormat.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //FileInputFormat.setInputPaths(conf, new Path(args[0]));
    MultipleInputs.addInputPath(conf, new Path(args[0]), NLinesInputFormat.class, InvertedMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/user/miyuru/notinverts/notinverts"), TextInputFormat.class,
            InvertedMapper.class);
    FileOutputFormat.setOutputPath(conf, new Path(dir1));

    // Also, for the moment we turn off speculative execution
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    conf.setNumMapTasks(96);
    conf.setNumReduceTasks(96);
    conf.setPartitionerClass(VertexPartitioner.class);
    conf.set("vertex-count", args[4]);
    conf.set("zero-flag", args[5]);

    Job job = new Job(conf, "csr_inverter");
    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);
}