List of usage examples for org.apache.hadoop.mapred.lib.MultipleInputs#addInputPath
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass, Class<? extends Mapper> mapperClass)
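For orientation, here is a minimal driver sketch showing typical usage of this method with the old mapred API. LogsMapper, CsvMapper, and JoinReducer are hypothetical classes invented for illustration; everything else is the standard Hadoop 1.x API.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsSketch {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(MultipleInputsSketch.class);
        job.setJobName("multipleinputs-sketch");

        // Each input path is paired with its own InputFormat and Mapper;
        // MultipleInputs records the mapping in the JobConf and installs a
        // DelegatingInputFormat/DelegatingMapper behind the scenes.
        // LogsMapper and CsvMapper are hypothetical mapper classes.
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, LogsMapper.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, CsvMapper.class);

        // A single reducer sees the merged map output from both sources.
        job.setReducerClass(JoinReducer.class); // hypothetical reducer
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        JobClient.runJob(job);
    }
}

The examples below show the same pattern in real projects.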
From source file:com.datasalt.pangool.benchmark.urlresolution.HadoopUrlResolution.java
License:Apache License
public final static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: urlresolution <url-map> <url-register> <out>");
        System.exit(2);
    }
    JobConf job = new JobConf(conf);

    FileSystem fS = FileSystem.get(conf);
    fS.delete(new Path(otherArgs[2]), true);

    MultipleInputs.addInputPath(job, new Path(otherArgs[0]), TextInputFormat.class, UrlMapClass.class);
    MultipleInputs.addInputPath(job, new Path(otherArgs[1]), TextInputFormat.class, UrlRegisterMapClass.class);

    job.setJarByClass(HadoopUrlResolution.class);
    job.setPartitionerClass(KeyPartitioner.class);
    job.setOutputValueGroupingComparator(GroupingComparator.class);

    job.setMapOutputKeyClass(UrlRegJoinUrlMap.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    Job j = new Job(job);
    j.setReducerClass(Reduce.class);
    j.waitForCompletion(true);
}
From source file:com.ebay.erl.mobius.core.mapred.MobiusMultiInputs.java
License:Apache License
public static void addInputPath(JobConf conf, Path anInput, Class<? extends InputFormat> inputFormatClass,
        Class<? extends AbstractMobiusMapper> mapperClass, byte datasetID, FileSystem fs) throws IOException {
    MultipleInputs.addInputPath(conf, anInput, inputFormatClass, mapperClass);

    // Override the {@link InputFormat} class set by {@link MultipleInputs},
    // as Mobius needs to set the current dataset ID per input split.
    conf.setInputFormat(MobiusDelegatingInputFormat.class);
    // MobiusDelegatingInputFormat extends DelegatingInputFormat, which always
    // calls FileInputFormat#setInputPaths within DelegatingInputFormat#getSplits,
    // regardless of the actual type of <code>inputFormatClass</code>.

    /////////////////////////////////////////////////////
    // start to build the path to dataset ID mapping
    /////////////////////////////////////////////////////
    MultiInputsHelper helper = MultiInputsHelpersRepository.getInstance(conf).getHelper(inputFormatClass);
    URI uri = helper.getUniquePathByInputFormat(conf, anInput);
    String aPath = uri.toString();
    if (aPath.indexOf(";") >= 0)
        throw new IllegalArgumentException(aPath + " cannot contain a semicolon");

    // set the input path to dataset ID mapping in the Hadoop configuration.
    if (conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING, "").isEmpty()) {
        conf.set(ConfigureConstants.INPUT_TO_DATASET_MAPPING, datasetID + ";" + aPath);
    } else {
        String previous = conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING);
        conf.set(ConfigureConstants.INPUT_TO_DATASET_MAPPING, datasetID + ";" + aPath + "," + previous);
    }
    //LOGGER.debug(conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING, ""));
}
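For context, a hedged sketch of how a caller might invoke this Mobius helper, matching the signature above. MyDatasetMapper and AnotherMapper are hypothetical subclasses of AbstractMobiusMapper, and the paths and dataset IDs are illustrative:

// Hypothetical usage; MyDatasetMapper and AnotherMapper extend AbstractMobiusMapper.
JobConf conf = new JobConf();
FileSystem fs = FileSystem.get(conf);
MobiusMultiInputs.addInputPath(conf, new Path("/data/dataset-a"),
        TextInputFormat.class, MyDatasetMapper.class, (byte) 0, fs);
MobiusMultiInputs.addInputPath(conf, new Path("/data/dataset-b"),
        TextInputFormat.class, AnotherMapper.class, (byte) 1, fs);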
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (!createJobConfAndParseArgs(args)) {
        return 0;
    }

    setFileSystem(FileSystem.get(job));

    FileStatus status = fs.getFileStatus(srcDir);
    if (null == status || !status.isDir()) {
        throw new IllegalArgumentException("No such directory: " + srcDir);
    }

    if (Mode.STAND_ALONE == mode) {
        standAlone();
    } else {
        writeDirs();

        MultipleInputs.addInputPath(job, bucketFiles, SequenceFileInputFormat.class, IdentityMapper.class);
        MultipleInputs.addInputPath(job, counters, CountersInputFormat.class, CountersMapper.class);

        job.setPartitionerClass(CrushPartitioner.class);
        job.setReducerClass(CrushReducer.class);
        job.setOutputKeyComparatorClass(Text.Comparator.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, bucketFiles);
        FileOutputFormat.setOutputPath(job, outDir);

        job.set("crush.partition.map", partitionMap.toString());

        if (0 != nBuckets) {
            print(Verbosity.INFO, "\n\nInvoking map reduce\n\n");
            RunningJob completed = JobClient.runJob(job);
            jobCounters = completed.getCounters();
        }

        long eligible = jobCounters.getCounter(MapperCounter.FILES_ELIGIBLE);
        long crushed = jobCounters.getCounter(ReducerCounter.FILES_CRUSHED);

        /*
         * This must hold true if Hadoop is working correctly.
         */
        if (eligible != crushed) {
            throw new AssertionError(format("Files eligible (%d) != files crushed (%d)", eligible, crushed));
        }

        if (Mode.CLONE == mode) {
            cloneOutput();
        } else {
            moveOutput();
        }
    }

    print(Verbosity.INFO, "\n\nDeleting temporary directory");
    fs.delete(tmpDir, true);

    /*
     * If we have printed anything to the console at all, then add a line wrap to bring the cursor back to the beginning.
     */
    print(Verbosity.INFO, "\n\n");

    return 0;
}
From source file:findstableweatherstate.FindStableWeatherState.java
public String call() throws Exception {
    Path firstOutputPath = new Path("input/firstOutput");
    Path secondOutputPath = new Path("input/secondOutput");
    long startTime, stopTime, elapsedTime;

    JobConf job = new JobConf();
    job.setJarByClass(getClass());
    job.setJobName("invertedindex");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(JoinReducer.class);

    MultipleInputs.addInputPath(job, new Path(getInputPathStation()), TextInputFormat.class,
            StationMapper.class);
    MultipleInputs.addInputPath(job, new Path(getInputPathReadings()), TextInputFormat.class,
            ReadingsMapper.class);

    FileOutputFormat.setOutputPath(job, firstOutputPath);

    JobConf job2 = new JobConf();
    job2.setJarByClass(getClass());
    job2.setJobName("secondJob");

    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);

    //job2.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    FileInputFormat.setInputPaths(job2, firstOutputPath);

    job2.setMapperClass(CalculateMinMaxTemperatureMapper.class);
    job2.setReducerClass(CalculateMaxMinTemperatureReducer.class);

    if (getOutputPath() != null) {
        FileOutputFormat.setOutputPath(job2, secondOutputPath);
    }

    JobConf job3 = new JobConf();
    job3.setJarByClass(getClass());
    job3.setJobName("thirdJob");

    job3.setOutputKeyClass(Text.class);
    job3.setOutputValueClass(Text.class);
    job3.setMapOutputKeyClass(DoubleWritable.class);
    job3.setMapOutputValueClass(Text.class);

    //job2.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
    FileInputFormat.setInputPaths(job3, secondOutputPath);

    job3.setMapperClass(SortStateMapper.class);
    job3.setReducerClass(SortStateReducer.class);

    if (getOutputPath() != null) {
        FileOutputFormat.setOutputPath(job3, new Path(getOutputPath()));
    }

    startTime = System.currentTimeMillis();
    JobClient.runJob(job);
    stopTime = System.currentTimeMillis();
    elapsedTime = stopTime - startTime;
    System.out.println("******************** First Job : " + elapsedTime / 1000);

    startTime = System.currentTimeMillis();
    JobClient.runJob(job2);
    stopTime = System.currentTimeMillis();
    elapsedTime = stopTime - startTime;
    System.out.println("******************** Second Job : " + elapsedTime / 1000);

    startTime = System.currentTimeMillis();
    JobClient.runJob(job3);
    stopTime = System.currentTimeMillis();
    elapsedTime = stopTime - startTime;
    System.out.println("******************** Third Job : " + elapsedTime / 1000);

    return "";
}
From source file:fm.last.hadoop.programs.labs.trackstats.TrackStatisticsProgram.java
License:Apache License
/**
 * Creates a JobConf for a Job that will merge the unique listeners and track statistics.
 *
 * @param outputPath The path for the results to be output to.
 * @param sumInputDir The path containing the data from the sum Job.
 * @param listenersInputDir The path containing the data from the unique listeners job.
 * @return The merge JobConf.
 */
private JobConf getMergeConf(Path outputPath, Path sumInputDir, Path listenersInputDir) {
    log.info("Creating configuration for merge job");
    JobConf conf = new JobConf(TrackStatisticsProgram.class);
    conf.setOutputKeyClass(IntWritable.class); // track id
    conf.setOutputValueClass(TrackStats.class); // overall track statistics
    conf.setCombinerClass(SumReducer.class); // safe to re-use reducer as a combiner here
    conf.setReducerClass(SumReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outputPath);
    MultipleInputs.addInputPath(conf, sumInputDir, SequenceFileInputFormat.class, IdentityMapper.class);
    MultipleInputs.addInputPath(conf, listenersInputDir, SequenceFileInputFormat.class,
            MergeListenersMapper.class);
    conf.setJobName("merge");
    return conf;
}
From source file:hibench.DataGenerator.java
License:Apache License
public void replaceIds(Path fcontent, Path fids, Path fjoin, ZipfRandom zipf) throws IOException {
    LOG.info("Replace Virtual Zipfian Ids with real Ids...");

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = fcontent.getName() + " JOIN " + fids.getName() + " -> " + fjoin.getName();
    job.setJobName(jobname);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    MultipleInputs.addInputPath(job, fids, TextInputFormat.class, TagRecordsMapper.class);
    MultipleInputs.addInputPath(job, fcontent, TextInputFormat.class, ReverseContentMapper.class);

    job.setOutputFormat(TextOutputFormat.class);

    // use combiner to avoid too many inputs for reducer
    job.setCombinerClass(ConcatTextCombiner.class);
    job.setReducerClass(JoinContentWithZipfReducer.class);

    if (zipf.reds > 0) {
        job.setNumReduceTasks(zipf.reds);
    } else {
        job.setNumReduceTasks(DataOptions.getMaxNumReduce());
    }

    FileOutputFormat.setOutputPath(job, fjoin);

    LOG.info("Running Job: " + jobname);
    LOG.info("Zipfian Id distribution: " + fids);
    LOG.info("Content file with virtual Ids: " + fcontent);
    LOG.info("Joint result file: " + fjoin);
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);
}
From source file:hibench.HiveDataGenerator.java
License:Apache License
private void createRankingsTable() throws IOException {
    LOG.info("Creating table rankings...");

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " rankings";
    job.setJobName(jobname);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);

    job.setCombinerClass(ConcatTextCombiner.class);
    job.setReducerClass(CountRankingAndReplaceIdReducer.class);

    if (options.reds > 0) {
        job.setNumReduceTasks(options.reds);
    } else {
        job.setNumReduceTasks(DataOptions.getMaxNumReduce());
    }
    // job.setNumReduceTasks(options.agents/2);

    /***
     * We need to join the result with the LINK table
     * to replace url ids with real contents.
     */
    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.T_LINK_PAGE), TextInputFormat.class,
            MyIdentityMapper.class);
    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.LINKS), TextInputFormat.class,
            TagRecordsMapper.class);

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.RANKINGS));

    LOG.info("Running Job: " + jobname);
    LOG.info("Table link-page file " + paths.getPath(DataPaths.T_LINK_PAGE) + " as input");
    LOG.info("Links file " + paths.getResult(DataPaths.LINKS) + " as input");
    LOG.info("Output file " + paths.getResult(DataPaths.RANKINGS));
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.RANKINGS));
}
From source file:hibench.HiveDataGenerator.java
License:Apache License
private void createUserVisitsTable() throws IOException, URISyntaxException {
    LOG.info("Creating user visits...");

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " uservisits";
    job.setJobName(jobname);

    /***
     * Set distributed cache files for table generation.
     * Cache files include:
     * 1. user agents
     * 2. country code and language code
     * 3. search keys
     */
    DistributedCache.addCacheFile(paths.getPath(DataPaths.uagentf).toUri(), job);
    DistributedCache.addCacheFile(paths.getPath(DataPaths.countryf).toUri(), job);
    DistributedCache.addCacheFile(paths.getPath(DataPaths.searchkeyf).toUri(), job);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);

    visit.setJobConf(job);

    job.setInputFormat(TextInputFormat.class);

    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.DUMMY), NLineInputFormat.class,
            CreateRandomAccessMapper.class);
    MultipleInputs.addInputPath(job, paths.getPath(DataPaths.LINKS), TextInputFormat.class,
            TagRecordsMapper.class);

    job.setCombinerClass(CreateUserVisitsCombiner.class);
    job.setReducerClass(CreateUserVisitsReducer.class);

    if (options.reds > 0) {
        job.setNumReduceTasks(options.reds);
    } else {
        job.setNumReduceTasks(DataOptions.getMaxNumReduce());
    }
    // job.setNumReduceTasks(options.agents/2);

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.USERVISITS));

    LOG.info("Running Job: " + jobname);
    LOG.info("Dummy file " + paths.getPath(DataPaths.DUMMY) + " as input");
    LOG.info("Links file " + paths.getResult(DataPaths.LINKS) + " as input");
    LOG.info("Output file " + paths.getResult(DataPaths.USERVISITS));
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.USERVISITS));
}
From source file:net.team1.dev.HousingAnalysis.java
License:Apache License
/**
 * The main entry point for the map/reduce runner.
 *
 * @param args 2 args: \<input dir\> \<output dir\>
 * @throws Exception Throws IOException
 */
public static void main(String[] args) throws Exception {
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    FileSystem fs = FileSystem.get(new Configuration());
    if (!fs.exists(inputDir))
        throw new IOException("The input path does not exist.");
    if (fs.isFile(inputDir))
        throw new IOException("The input path is a file.");
    if (fs.exists(outputDir))
        fs.delete(outputDir, true);

    // set job configuration
    JobConf conf = new JobConf(HousingAnalysis.class);
    conf.setJobName("housinganalysis");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setCombinerClass(HousingReducer.class);
    conf.setReducerClass(HousingReducer.class);

    // set multiple input files
    HashMap<Path, Class<? extends Mapper>> inputMappers = getInputFilePaths(inputDir, fs);
    for (Path p : inputMappers.keySet()) {
        MultipleInputs.addInputPath(conf, p, TextInputFormat.class, inputMappers.get(p));
        LOG.info(p.getName() + ": " + inputMappers.get(p).getName());
    }

    // set output
    FileOutputFormat.setOutputPath(conf, outputDir);

    // start the job
    JobClient.runJob(conf);
}
From source file:org.acacia.csr.java.CSRConverter.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (!validArgs(args)) {
        printUsage();
        return;
    }

    // These are the temp paths that are created on HDFS
    String dir1 = "/user/miyuru/csrconverter-output";
    String dir2 = "/user/miyuru/csrconverter-output-sorted";

    // We first delete the temporary directories if they exist on the HDFS
    FileSystem fs1 = FileSystem.get(new JobConf());

    System.out.println("Deleting the dir : " + dir1);
    if (fs1.exists(new Path(dir1))) {
        fs1.delete(new Path(dir1), true);
    }
    System.out.println("Done deleting the dir : " + dir1);

    System.out.println("Deleting the dir : " + dir2);
    if (fs1.exists(new Path(dir2))) {
        fs1.delete(new Path(dir2), true);
    }

    Path notinPath = new Path("/user/miyuru/notinverts/notinverts");
    if (!fs1.exists(notinPath)) {
        fs1.create(notinPath);
    }
    System.out.println("Done deleting the dir : " + dir2);

    // Note on Aug 23 2014: Sometimes after this the MapReduce job hangs; need to see why.
    VertexCounterClient.setDefaultGraphID(args[3], args[2]);

    // First job creates the inverted index
    JobConf conf = new JobConf(CSRConverter.class);
    conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[1]);
    conf.set("org.acacia.partitioner.hbase.table", args[2]);
    conf.set("org.acacia.partitioner.hbase.contacthost", args[3]);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    //conf.setMapperClass(InvertedMapper.class);
    conf.setReducerClass(InvertedReducer.class);
    //conf.setInputFormat(TextInputFormat.class);
    conf.setInputFormat(NLinesInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //FileInputFormat.setInputPaths(conf, new Path(args[0]));
    MultipleInputs.addInputPath(conf, new Path(args[0]), NLinesInputFormat.class, InvertedMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/user/miyuru/notinverts/notinverts"), TextInputFormat.class,
            InvertedMapper.class);
    FileOutputFormat.setOutputPath(conf, new Path(dir1));

    // Also, for the moment we turn off speculative execution
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    conf.setNumMapTasks(96);
    conf.setNumReduceTasks(96);
    conf.setPartitionerClass(VertexPartitioner.class);
    conf.set("vertex-count", args[4]);
    conf.set("zero-flag", args[5]);

    Job job = new Job(conf, "csr_inverter");
    job.setSortComparatorClass(SortComparator.class);
    job.waitForCompletion(true);
}