Example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat setInputDirRecursive

List of usage examples for org.apache.hadoop.mapreduce.lib.input FileInputFormat setInputDirRecursive

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat setInputDirRecursive. By default, FileInputFormat reads only the files that sit directly under each input path; calling setInputDirRecursive(job, true) sets mapreduce.input.fileinputformat.input.dir.recursive so that the input directories are scanned recursively.

Prototype

public static void setInputDirRecursive(Job job, boolean inputDirRecursive) 
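
Before the project-specific examples below, here is a minimal, self-contained sketch of how the call is typically wired into a job. The class name RecursiveInputExample and the use of the library's identity Mapper as a map-only pass-through are illustrative assumptions, not taken from the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RecursiveInputExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "RecursiveInputExample");
        job.setJarByClass(RecursiveInputExample.class);

        // Identity mapper from the Hadoop library; the job is map-only.
        job.setMapperClass(Mapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setNumReduceTasks(0);

        // FileInputFormat normally reads only the files directly under each
        // input path; this makes it descend into subdirectories as well.
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The examples below follow the same pattern: setInputDirRecursive is called on the Job alongside setInputPaths or addInputPath, so that every file found under the given directories, at any depth, becomes job input.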

Usage

From source file:ca.uwaterloo.iss4e.hadoop.meterperfile.ThreelMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.meterperfile.ThreelMain <input> <output>");
        System.exit(2);
    }

    conf.set("mapreduce.input.fileinputformat.split.maxsize", "100");
    Job job = new Job(conf, "ThreelMain");
    job.setJarByClass(ThreelMain.class);

    job.setInputFormatClass(UnsplitableTextInputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setNumReduceTasks(0);
    // job.setOutputKeyClass(LongWritable.class);
    //job.setOutputValueClass(Text.class);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.ConsineMain <input> <output>");
        System.exit(2);
    }
    Job job1 = new Job(conf, "ConsineMain");
    job1.setJarByClass(CosineMain.class);

    job1.setMapperClass(AggregateReadingsMapper.class);
    job1.setMapOutputKeyClass(LongWritable.class);
    job1.setMapOutputValueClass(DoubleWritable.class);

    job1.setReducerClass(AggregateReadingsReducer.class);
    job1.setOutputKeyClass(LongWritable.class);
    job1.setOutputValueClass(Text.class);
    FileInputFormat.setInputDirRecursive(job1, true);
    FileInputFormat.setInputPaths(job1, new Path(otherArgs[0]));
    int lastIdx = otherArgs[0].lastIndexOf("/");
    String tempOutput = otherArgs[0].substring(0, lastIdx) + "/temp";
    FileOutputFormat.setOutputPath(job1, new Path(tempOutput));

    System.out.println("\nStarting Job-1 ...");
    final long startTime = System.currentTimeMillis();
    try {
        final long startTimeJob1 = System.currentTimeMillis();
        if (!job1.waitForCompletion(true)) {
            System.out.println("Job-1 failed.");
        } else {
            System.out.println("Duration of Job1 " + ((System.currentTimeMillis() - startTimeJob1) / 1000.0)
                    + " seconds.");
            final Job job2 = new Job(conf, "ConsineMain Aggregate");
            job2.setJarByClass(CosineMain.class);
            job2.setInputFormatClass(CartesianInputFormat.class);
            CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, tempOutput);
            CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, tempOutput);
            FileOutputFormat.setOutputPath(job2, new Path(otherArgs[1]));

            job2.setMapperClass(CartesianProductMapper.class);
            job2.setMapOutputKeyClass(DoubleWritable.class);
            job2.setMapOutputValueClass(Text.class);

            job2.setSortComparatorClass(DescendingKeyComparator.class);

            job2.setReducerClass(CartesianProductReducer.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(DoubleWritable.class);

            job2.setNumReduceTasks(10);
            final long startTimeJob2 = System.currentTimeMillis();
            System.out.println("\nStarting Job-2 ...");
            if (!job2.waitForCompletion(true)) {
                System.out.println("Job-2 failed.");
            } else {
                System.out.println("Duration of Job2: "
                        + ((System.currentTimeMillis() - startTimeJob2) / 1000.0) + " seconds.");
            }

        }
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(tempOutput), true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Total Duration: " + duration + " seconds.");
    }
    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.HistogramMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.HistogramMain <input> <output>");
        System.exit(2);
    }
    Job job = new Job(conf, "HistogramMain");
    job.setJarByClass(HistogramMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.PARMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.PARMain <input> <output>");
        System.exit(2);
    }
    Job job = new Job(conf, "PARMain");
    job.setJarByClass(PARMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.ThreelMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.ThreelMain <input> <output>");
        System.exit(2);
    }

    Job job = new Job(conf, "ThreelMain");
    job.setJarByClass(ThreelMain.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(ArrayPrimitiveWritable.class);

    job.setCombinerClass(MyCombiner.class);

    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.out.println("\nStarting Job ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            System.out.println("Job failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:com.ifeng.logparser.NginxLogDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = Job.getInstance(super.getConf());
    FileInputFormat.setInputDirRecursive(job, true);

    //FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    FileInputFormat.addInputPaths(job, args[0]);

    job.setMapperClass(NginxLogMapper.class);
    job.setReducerClass(NginxLogReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.msd.gin.halyard.tools.HalyardBulkLoad.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkload [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + SKIP_INVALID_PROPERTY + "=true] [-D" + SPLIT_BITS_PROPERTY + "=8] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_path(s)> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + args[1] + " -> " + args[2]);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(CONTEXT_SPLIT_REGEXP).entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}

From source file:com.msd.gin.halyard.tools.HalyardHiveLoad.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: hiveload -D" + RDF_MIME_TYPE_PROPERTY + "='application/ld+json' [-D"
                + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D" + HIVE_DATA_COLUMN_INDEX_PROPERTY + "=3] [-D"
                + BASE_URI_PROPERTY + "='http://my_base_uri/'] [-D" + HalyardBulkLoad.SPLIT_BITS_PROPERTY
                + "=8] [-D" + HalyardBulkLoad.DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D"
                + HalyardBulkLoad.OVERRIDE_CONTEXT_PROPERTY
                + "=true] <hive_table_name> <output_path> <hbase_table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    Job job = Job.getInstance(getConf(), "HalyardHiveLoad -> " + args[1] + " -> " + args[2]);
    int i = args[0].indexOf('.');
    HCatInputFormat.setInput(job, i > 0 ? args[0].substring(0, i) : null, args[0].substring(i + 1));
    job.setJarByClass(HalyardHiveLoad.class);
    job.setMapperClass(HiveMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(HalyardBulkLoad.CONTEXT_SPLIT_REGEXP)
            .entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(HalyardBulkLoad.SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}

From source file:com.ov.project.dev.crawler.ClientOVSocket.java

public Map<VelibStation, Prediction> jobToDo(Map<VelibKey, Integer> iDataProvided) {

    System.setProperty("hadoop.home.dir", BundelUtils.get("hadoop.home"));

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext lSparkctx = SingletonWrappers.sparkContextGetInstance();

    Job lJob;

    try {
        lJob = Job.getInstance();
        FileInputFormat.setInputPaths(lJob, new Path(BundelUtils.get("data.frame.path")));
        FileInputFormat.setInputDirRecursive(lJob, true);
    } catch (IOException e1) {
        e1.printStackTrace();
        System.exit(1);
    }

    Prediction predi = new Prediction();

    //   com.ov.PredictionsBuilder.runPredictions(BundelUtils.get("bruteData.path"), BundelUtils.get("static.path"), BundelUtils.get("output.path"), BundelUtils.get("model.path"), BundelUtils.get("hadoop.home"), iSave, Calendar.MINUTE, new TimeStamp(), new TimeStamp(), BundelUtils.get("license.path"));

    //   JavaRDD<Text> sourceData = lSparkctx
    //         .newAPIHadoopRDD(lJob.getConfiguration(), TextInputFormat.class, LongWritable.class, Text.class)
    //         .values();

    // Each line will be translated to a session defined by the IP address
    //   JavaPairRDD<VelibStation, Prediction> lsession = sourceData
    //         .mapToPair(
    //               w -> new Tuple2<VelibStation, Prediction>(LogParser.getFirstToken(w), LogParser.parseTokenz(w)))
    //         .reduceByKey((a, b) -> reduceByIP(a, b));

    // Save the word count back out to a text file, causing evaluation.
    //      FileUtils.deleteQuietly(new File(BundelUtils.get("suffix.for.result.file")));
    //      lsession.saveAsTextFile(BundelUtils.get("suffix.for.result.file"));

    return null;
}

From source file:com.rw.legion.DefaultJob.java

License:Apache License

/**
 * Main method.
 * 
 * @param args  Arguments should be: 1) input path, 2) output path, 3)
 * location of Legion objective file.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Load the Legion objective from the JSON doc.
    Path path = new Path(args[2]);
    FileSystem fs = FileSystem.get(new URI(args[2]), conf);
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String json = "";

    String line = br.readLine();

    while (line != null) {
        json += line;
        line = br.readLine();
    }

    br.close();

    /*
     *  Save the JSON for the Legion objective to the Hadoop configuration,
     *  so we can access it in other containers.
     */
    conf.setStrings("legion_objective", json);

    // De-serialize the objective so we can access the settings here.
    LegionObjective legionObjective = ObjectiveDeserializer.deserialize(json);

    // Start configuring the MapReduce job.
    Job hadoopJob = Job.getInstance(conf, "Legion");

    hadoopJob.setJarByClass(DefaultJob.class);
    hadoopJob.setMapperClass(DefaultMapper.class);
    LazyOutputFormat.setOutputFormatClass(hadoopJob, TextOutputFormat.class);

    // Compress the output to speed things up.
    TextOutputFormat.setCompressOutput(hadoopJob, true);
    TextOutputFormat.setOutputCompressorClass(hadoopJob, GzipCodec.class);

    // What input format do we use?

    try {
        @SuppressWarnings("unchecked")
        Class<? extends FileInputFormat<NullWritable, LegionRecord>> inputClass = (Class<? extends FileInputFormat<NullWritable, LegionRecord>>) Class
                .forName(legionObjective.getInputFormat());

        hadoopJob.setInputFormatClass(inputClass);
    } catch (Exception e) {
        throw new JsonParseException(
                "Problem loading input format " + "class '" + legionObjective.getInputFormat() + "'");
    }

    // Should we set a max combined size?

    if (legionObjective.getMaxCombinedSize() != null) {
        CombineFileInputFormat.setMaxInputSplitSize(hadoopJob, legionObjective.getMaxCombinedSize());
    }

    /* 
     * These are just static convenience methods, so it doesn't matter if
     * they come from the wrong class.
     */
    FileInputFormat.setInputDirRecursive(hadoopJob, true);
    FileInputFormat.addInputPath(hadoopJob, new Path(args[0]));

    FileOutputFormat.setOutputPath(hadoopJob, new Path(args[1]));

    // A Legion objective can specify multiple output tables, so register a named output for each.
    for (OutputTable outputTable : legionObjective.getOutputTables()) {
        MultipleOutputs.addNamedOutput(hadoopJob, outputTable.getTitle(), TextOutputFormat.class,
                NullWritable.class, Text.class);
    }

    MultipleOutputs.addNamedOutput(hadoopJob, "skipped", TextOutputFormat.class, NullWritable.class,
            Text.class);

    hadoopJob.waitForCompletion(true);
}