Example usage for org.apache.hadoop.mapred JobConf setInputFormat

List of usage examples for org.apache.hadoop.mapred JobConf setInputFormat

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setInputFormat.

Prototype

public void setInputFormat(Class<? extends InputFormat> theClass) 

Source Link

Document

Set the InputFormat implementation for the map-reduce job.

Usage

From source file:hibench.PageRankDataGenerator.java

License:Apache License

private void createPageRankNodes() throws IOException {

    LOG.info("Creating PageRank nodes...", null);

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " pagerank nodes";

    job.setJobName(jobname);/*  w ww  .  j a  v a2s  .c o  m*/

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, paths.getPath(DataPaths.LINKS));
    job.setInputFormat(TextInputFormat.class);

    if (options.PAGERANK_NODE_BALANCE) {
        /***
         * Balance the output order of nodes, to prevent the running
         * of pagerank bench from potential data skew
         */
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setMapperClass(BalancedLinkNodesMapper.class);
        job.setReducerClass(BalancedLinkNodesReducer.class);
        //         job.setPartitionerClass(ModulusPartitioner.class);

        if (options.reds > 0) {
            job.setNumReduceTasks(options.reds);
        } else {
            job.setNumReduceTasks(DataOptions.getMaxNumReduce());
        }
    } else {
        job.setMapOutputKeyClass(Text.class);

        job.setMapperClass(OutputLinkNodesMapper.class);
        job.setNumReduceTasks(0);
    }

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.VERTICALS));

    LOG.info("Running Job: " + jobname);
    LOG.info("Links file " + paths.getPath(DataPaths.LINKS) + " as input");
    LOG.info("Vertices file " + paths.getResult(DataPaths.VERTICALS) + " as output");
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.VERTICALS));
}

From source file:hibench.PageRankDataGenerator.java

License:Apache License

/***
 * Create pagerank edge table, output link A->B as <A, B> pairs
 * @throws IOException//  www . j a v  a2s  . c o m
 */
private void createPageRankLinks() throws IOException {

    LOG.info("Creating PageRank links", null);

    JobConf job = new JobConf(WebDataGen.class);
    String jobname = "Create " + paths.dname + " pagerank links";

    job.setJobName(jobname);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapOutputKeyClass(Text.class);

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, paths.getPath(DataPaths.T_LINK_PAGE));
    job.setInputFormat(TextInputFormat.class);

    job.setMapperClass(OutputLinkEdgesMapper.class);

    if (options.SEQUENCE_OUT) {
        job.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        job.setOutputFormat(TextOutputFormat.class);
    }

    if (null != options.codecClass) {
        job.set("mapred.output.compression.type", "BLOCK");
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, options.codecClass);
    }

    FileOutputFormat.setOutputPath(job, paths.getResult(DataPaths.EDGES));

    LOG.info("Running Job: " + jobname);
    LOG.info("Table link-page " + paths.getPath(DataPaths.T_LINK_PAGE) + " as input");
    LOG.info("Edges file " + paths.getResult(DataPaths.EDGES) + " as output");
    JobClient.runJob(job);
    LOG.info("Finished Running Job: " + jobname);

    LOG.info("Cleaning temp files...");
    paths.cleanTempFiles(paths.getResult(DataPaths.EDGES));
}

From source file:hitune.analysis.mapreduce.processor.HadoopMetrics.java

License:Apache License

@Override
public void run() {
    // TODO Auto-generated method stub
    long timestamp = System.currentTimeMillis();
    JobConf conf = new JobConf(this.conf, HadoopMetrics.class);
    try {//  w w  w.  j  ava  2  s.  c  o m
        conf.setJobName(this.getClass().getSimpleName() + timestamp);
        conf.setInputFormat(MultiSequenceFileInputFormat.class);
        conf.setMapperClass(HadoopMetrics.MapClass.class);
        conf.setReducerClass(SystemLog.ReduceClass.class);
        Class<? extends WritableComparable> outputKeyClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputKeyClass))
                .asSubclass(WritableComparable.class);
        Class<? extends Writable> outputValueClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputValueClass))
                .asSubclass(Writable.class);
        conf.setMapOutputKeyClass(outputKeyClass);
        conf.setMapOutputValueClass(outputValueClass);

        conf.setOutputKeyClass(Text.class);

        conf.setOutputValueClass(TextArrayWritable.class);
        conf.setOutputFormat(CSVFileOutputFormat.class);

        String outputPaths = conf.get(AnalysisProcessorConfiguration.reportfolder) + "/"
                + conf.get(AnalysisProcessorConfiguration.reportfile);
        String temp_outputPaths = getTempOutputDir(outputPaths);

        if (this.inputfiles != null) {
            log.debug("inputPaths:" + inputfiles);
            FileInputFormat.setInputPaths(conf, inputfiles);
            FileOutputFormat.setOutputPath(conf, new Path(temp_outputPaths));
            try {
                JobClient.runJob(conf);
                moveResults(conf, outputPaths, temp_outputPaths);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                log.warn("For " + getOutputFileName() + " :JOB fails!");
                log.warn(e);
                e.printStackTrace();
                this.MOVE_DONE = false;
            }

        } else {
            log.warn("For " + getOutputFileName() + " :No input path!");
        }

    } catch (Exception e) {

        log.warn("Job preparation failure!");
        log.warn(e);
        e.printStackTrace();
    }

}

From source file:hitune.analysis.mapreduce.processor.HistoryLog.java

License:Apache License

public void run() {
    // TODO Auto-generated method stub
    long timestamp = System.currentTimeMillis();
    JobConf conf = new JobConf(this.conf, HistoryLog.class);
    try {//from  w ww . j  a  v a  2  s  . co  m
        conf.setJobName(this.getClass().getSimpleName() + timestamp);
        conf.setInputFormat(MultiSequenceFileInputFormat.class);
        conf.setMapperClass(HistoryLog.MapClass.class);
        conf.setReducerClass(HistoryLog.ReduceClass.class);
        conf.setOutputKeyClass(Text.class);

        Class<? extends WritableComparable> outputKeyClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputKeyClass))
                .asSubclass(WritableComparable.class);
        Class<? extends Writable> outputValueClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputValueClass))
                .asSubclass(Writable.class);
        conf.setMapOutputKeyClass(outputKeyClass);
        conf.setMapOutputValueClass(outputValueClass);

        conf.setOutputValueClass(TextArrayWritable.class);
        conf.setOutputFormat(CSVFileOutputFormat.class);

        String outputPaths = conf.get(AnalysisProcessorConfiguration.reportfolder) + "/"
                + conf.get(AnalysisProcessorConfiguration.reportfile);
        String temp_outputPaths = getTempOutputDir(outputPaths);

        if (this.inputfiles != null) {
            log.debug("inputPaths:" + inputfiles);
            FileInputFormat.setInputPaths(conf, inputfiles);
            FileOutputFormat.setOutputPath(conf, new Path(temp_outputPaths));

            try {
                JobClient.runJob(conf);
                moveResults(conf, outputPaths, temp_outputPaths);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                log.warn("For " + getOutputFileName() + " :JOB fails!");
                log.warn(e);
                e.printStackTrace();
                this.MOVE_DONE = false;
            }

        } else {
            log.warn("For " + getOutputFileName() + " :No input path!");
        }
    } catch (Exception e) {
        log.warn("Job preparation failure!");
        log.warn(e);
        e.printStackTrace();
    }
}

From source file:hitune.analysis.mapreduce.processor.InstrumentDataflow.java

License:Apache License

@Override
public void run() {
    // TODO Auto-generated method stub

    long timestamp = System.currentTimeMillis();

    JobConf conf = new JobConf(this.conf, InstrumentDataflow.class);
    try {//from  w ww  .  j a  v a 2 s . c o  m
        conf.setJobName(this.getClass().getSimpleName() + timestamp);
        conf.setInputFormat(MultiSequenceFileInputFormat.class);
        conf.setMapperClass(InstrumentDataflow.MapClass.class);
        conf.setReducerClass(InstrumentDataflow.ReduceClass.class);
        conf.setOutputKeyClass(Text.class);
        Class<? extends WritableComparable> outputKeyClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputKeyClass))
                .asSubclass(WritableComparable.class);
        Class<? extends Writable> outputValueClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputValueClass))
                .asSubclass(Writable.class);
        conf.setMapOutputKeyClass(outputKeyClass);
        conf.setMapOutputValueClass(outputValueClass);

        conf.setOutputValueClass(TextArrayWritable.class);
        conf.setOutputFormat(CSVFileOutputFormat.class);

        String outputPaths = conf.get(AnalysisProcessorConfiguration.reportfolder) + "/"
                + conf.get(AnalysisProcessorConfiguration.reportfile);
        String temp_outputPaths = getTempOutputDir(outputPaths);

        if (this.inputfiles != null) {
            log.debug("inputPaths:" + inputfiles);
            FileInputFormat.setInputPaths(conf, inputfiles);
            FileOutputFormat.setOutputPath(conf, new Path(temp_outputPaths));

            //FileInputFormat.setInputPathFilter(conf, evtFileFilter.class);
            //conf.setNumReduceTasks(1);

            try {
                JobClient.runJob(conf);
                moveResults(conf, outputPaths, temp_outputPaths);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                log.warn("For " + getOutputFileName() + " :JOB fails!");
                log.warn(e);
                e.printStackTrace();
                this.MOVE_DONE = false;
            }

        } else {
            log.warn("For " + getOutputFileName() + " :No input path!");
        }
    } catch (Exception e) {
        log.warn("Job preparation failure!");
        log.warn(e);
        e.printStackTrace();
    }
}

From source file:hitune.analysis.mapreduce.processor.InstrumentSamplingTop.java

License:Apache License

@Override
public void run() {
    // TODO Auto-generated method stub
    long timestamp = System.currentTimeMillis();
    try {//from w ww .  j a  v a2  s.co m
        JobConf conf = new JobConf(this.conf, InstrumentSamplingTop.class);
        conf.setJobName(this.getClass().getSimpleName() + "_1_" + timestamp);

        conf.setInputFormat(MultiSequenceFileInputFormat.class);
        conf.setMapperClass(InstrumentSamplingTop.MapClass.class);
        conf.setReducerClass(InstrumentSamplingTop.ReduceClass.class);

        Class<? extends WritableComparable> outputKeyClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputKeyClass))
                .asSubclass(WritableComparable.class);
        Class<? extends Writable> outputValueClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputValueClass))
                .asSubclass(Writable.class);
        conf.setMapOutputKeyClass(outputKeyClass);
        conf.setMapOutputValueClass(outputValueClass);
        conf.setOutputKeyClass(outputKeyClass);
        conf.setOutputValueClass(outputValueClass);

        conf.setOutputFormat(SequenceFileOutputFormat.class);

        String outputPaths = conf.get(AnalysisProcessorConfiguration.reportfolder) + "/"
                + conf.get(AnalysisProcessorConfiguration.reportfile);

        String temp_outputPaths = getTempOutputDir(outputPaths);

        if (this.inputfiles != null) {
            log.debug("inputPaths:" + inputfiles);
            FileInputFormat.setInputPaths(conf, inputfiles);
            FileOutputFormat.setOutputPath(conf, new Path(outputPaths + "_1_" + timestamp));

            try {

                //first job
                JobClient.runJob(conf);

                JobConf secondconf = new JobConf(this.conf, InstrumentSamplingTop.class);
                secondconf.setJobName(this.getClass().getSimpleName() + "_2_" + timestamp);
                secondconf.setInputFormat(SequenceFileInputFormat.class);
                secondconf.setMapperClass(IdentityMapper.class);
                secondconf.setReducerClass(InstrumentSamplingTop.TopClass.class);

                secondconf.setMapOutputKeyClass(outputKeyClass);
                secondconf.setMapOutputValueClass(outputValueClass);

                secondconf.setOutputKeyClass(Text.class);
                secondconf.setOutputValueClass(TextArrayWritable.class);
                secondconf.setOutputFormat(CSVFileOutputFormat.class);
                FileInputFormat.setInputPaths(secondconf, outputPaths + "_1_" + timestamp);
                FileOutputFormat.setOutputPath(secondconf, new Path(temp_outputPaths));

                //second job to get ranking list
                JobClient.runJob(secondconf);
                moveResults(secondconf, outputPaths, temp_outputPaths);
                Path temp = new Path(outputPaths + "_1_" + timestamp);
                temp.getFileSystem(conf).delete(temp);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                log.warn("For " + getOutputFileName() + " :JOB fails!");
                log.warn(e);
                e.printStackTrace();
                this.MOVE_DONE = false;
            }

        } else {
            log.warn("For " + getOutputFileName() + " :No input path!");
        }
    } catch (Exception e) {
        log.warn("Job preparation failure!");
        log.warn(e);
        e.printStackTrace();
    }

}

From source file:hitune.analysis.mapreduce.processor.SystemLog.java

License:Apache License

@Override
public void run() {

    // TODO Auto-generated method stub
    long timestamp = System.currentTimeMillis();
    JobConf conf = new JobConf(this.conf, SystemLog.class);

    try {// w  w w  .  j  a va  2s  .  com
        conf.setJobName(this.getClass().getSimpleName() + timestamp);

        conf.setInputFormat(MultiSequenceFileInputFormat.class);
        conf.setMapperClass(SystemLog.MapClass.class);
        conf.setReducerClass(SystemLog.ReduceClass.class);

        Class<? extends WritableComparable> outputKeyClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputKeyClass))
                .asSubclass(WritableComparable.class);
        Class<? extends Writable> outputValueClass = Class
                .forName(conf.get(AnalysisProcessorConfiguration.mapoutputValueClass))
                .asSubclass(Writable.class);
        conf.setMapOutputKeyClass(outputKeyClass);
        conf.setMapOutputValueClass(outputValueClass);

        conf.setOutputKeyClass(Text.class);

        conf.setOutputValueClass(TextArrayWritable.class);
        conf.setOutputFormat(CSVFileOutputFormat.class);

        String outputPaths = conf.get(AnalysisProcessorConfiguration.reportfolder) + "/"
                + conf.get(AnalysisProcessorConfiguration.reportfile);
        String temp_outputPaths = getTempOutputDir(outputPaths);

        if (this.inputfiles != null) {
            log.debug("inputPaths:" + inputfiles);
            FileInputFormat.setInputPaths(conf, inputfiles);
            FileOutputFormat.setOutputPath(conf, new Path(temp_outputPaths));
            try {
                JobClient.runJob(conf);
                moveResults(conf, outputPaths, temp_outputPaths);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                log.warn("For " + getOutputFileName() + " :JOB fails!");
                log.warn(e);
                e.printStackTrace();
                this.MOVE_DONE = false;
            }

        } else {
            log.warn("For " + getOutputFileName() + " :No input path!");

        }

    } catch (Exception e) {
        log.warn("Job preparation failure!");
        log.warn(e);
        e.printStackTrace();
    }

}

From source file:hydrograph.engine.cascading.scheme.avro.CustomAvroScheme.java

License:Apache License

/**
 * sourceConfInit is called by cascading to set up the sources. This happens
 * on the client side before the job is distributed. There is a check for
 * the presence of a schema and if none has been provided the data is peeked
 * at to get a schema. After the schema check the conf object is given the
 * options that Avro needs.//from w  w  w .  ja v a 2 s . c  o m
 *
 * @param flowProcess
 *            The cascading FlowProcess object. Should be passed in by
 *            cascading automatically.
 * @param tap
 *            The cascading Tap object. Should be passed in by cascading
 *            automatically.
 * @param conf
 *            The Hadoop JobConf object. This is passed in by cascading
 *            automatically.
 * @throws RuntimeException
 *             If no schema is present this halts the entire process.
 */
@Override
public void sourceConfInit(FlowProcess<? extends JobConf> flowProcess,
        Tap<JobConf, RecordReader, OutputCollector> tap, JobConf conf) {

    retrieveSourceFields(flowProcess, tap);
    // Set the input schema and input class
    conf.set(AvroJob.INPUT_SCHEMA, schema.toString());
    conf.setInputFormat(AvroInputFormat.class);

    // add AvroSerialization to io.serializations
    addAvroSerializations(conf);
}

From source file:hydrograph.engine.cascading.scheme.hive.parquet.HiveParquetScheme.java

License:Apache License

@Override
public void sourceConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf jobConf) {

    jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
    jobConf.set("hive.parquet.timestamp.skip.conversion", "false");
    ParquetInputFormat.setReadSupportClass(jobConf, DataWritableReadSupport.class);
}

From source file:hydrograph.engine.cascading.scheme.parquet.ParquetTupleScheme.java

License:Apache License

@Override
public void sourceConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf jobConf) {
    jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
    ParquetInputFormat.setReadSupportClass(jobConf, DataWritableReadSupport.class);
    // TupleReadSupport.setRequestedFields(jobConf, getSourceFields());
}