Example usage for org.apache.hadoop.mapred JobConf setInputFormat

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf#setInputFormat from open-source projects.

Prototype

public void setInputFormat(Class<? extends InputFormat> theClass) 

Document

Set the InputFormat implementation for the map-reduce job.
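
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing where setInputFormat fits in a classic org.apache.hadoop.mapred driver. It wires Hadoop's built-in identity mapper and reducer to TextInputFormat purely to illustrate the call:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputFormatExample.class);
        conf.setJobName("setInputFormat-example");

        // The call this page documents: choose how input files are split
        // and turned into records. TextInputFormat produces
        // <LongWritable byte offset, Text line> pairs.
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // The identity mapper and reducer pass records through unchanged,
        // so the job's output types match the input format's key/value types.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}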

Usage

From source file: edu.stolaf.cs.wmrserver.HadoopEngine.java

License: Apache License
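
This method builds and submits a Hadoop Streaming job: it wraps the user's mapper and reducer scripts, packages them into a job JAR, reads tab-delimited input through KeyValueTextInputFormat, optionally enables numeric key sorting, and records the submission in a database.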

public void submit(JobRequest request, long submissionID, File mapperFile, File reducerFile, File packageDir,
        Path inputPath) throws ValidationException, NotFoundException, CompilationException, InternalException {
    // Generate job output path
    Path outputDir = new Path(_homeDir, "out");
    Path outputPath;
    try {
        FileSystem fs = outputDir.getFileSystem(new Configuration());
        outputPath = JobServiceHandler.getNonexistantPath(outputDir, request.getName(), fs);
    } catch (IOException ex) {
        throw JobServiceHandler.wrapException("Could not construct output path.", ex);
    }

    JobConf conf = new JobConf();
    conf.setJobName(request.getName());

    // Set mapper and number of tasks if specified
    StreamJob.setStreamMapper(conf, mapperFile.toString());
    if (request.isSetMapTasks())
        conf.setNumMapTasks(request.getMapTasks());

    // Set reducer and number of tasks if specified
    StreamJob.setStreamReducer(conf, reducerFile.toString());
    if (request.isSetReduceTasks())
        conf.setNumReduceTasks(request.getReduceTasks());

    // Create and set job JAR, including necessary files
    ArrayList<String> jarFiles = new ArrayList<String>();
    jarFiles.add(packageDir.toString());
    String jarPath;
    try {
        jarPath = StreamJob.createJobJar(conf, jarFiles, _tempDir);
    } catch (IOException ex) {
        throw JobServiceHandler.wrapException("Could not create job jar.", ex);
    }
    if (jarPath != null)
        conf.setJar(jarPath);

    // TODO: This is a hack. Rewrite streaming to use DistributedCache.
    //conf.setPattern("mapreduce.job.jar.unpack.pattern",
    //              Pattern.compile(".*"));

    // Set I/O formats and paths
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // Use numeric sort if appropriate
    conf.setBoolean(CONF_NUMERIC, request.isNumericSort());
    if (request.isNumericSort()) {
        conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
        conf.setPartitionerClass(KeyFieldBasedPartitioner.class);
        conf.setKeyFieldComparatorOptions("-n");
        conf.setKeyFieldPartitionerOptions("-n");
    }

    // Set other job information
    conf.set(CONF_USER, request.getUser());
    conf.set(CONF_LANGUAGE, request.getLanguage());
    conf.set(CONF_MAPPER, request.getMapper());
    conf.set(CONF_REDUCER, request.getReducer());

    // Attempt to submit the job

    RunningJob job;
    try {
        JobClient client = new JobClient(new JobConf());
        job = client.submitJob(conf);
    } catch (IOException ex) {
        throw JobServiceHandler.wrapException("There was a serious error while attempting to submit the job.",
                ex);
    }

    try {
        SubmissionDatabase.setSubmitted(submissionID);
        SubmissionDatabase.setHadoopID(submissionID, job.getID().toString());
    } catch (SQLException ex) {
        throw JobServiceHandler.wrapException("Could not update submission in database.", ex);
    }
}

From source file: edu.ub.ahstfg.indexer.Indexer.java

License: Open Source License
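
This driver configures an indexing job over web-archive input: ArcInputFormat reads the ARC files, a custom IndexOutputFormat writes IndexRecord values, and the output path is cleared before the job runs.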

@Override
public int run(String[] arg0) throws Exception {
    LOG.info("Creating Hadoop job for Indexer.");
    JobConf job = new JobConf(getConf());
    job.setJarByClass(Indexer.class);

    LOG.info("Setting input path to '" + INPUT_PATH + "'");
    FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
    // Set filters here if necessary.

    LOG.info("Clearing the output path at '" + OUTPUT_PATH + "'");
    // Change the URI to a Path if necessary.
    FileSystem fs = FileSystem.get(new URI(OUTPUT_PATH), job);

    if (fs.exists(new Path(OUTPUT_PATH))) {
        fs.delete(new Path(OUTPUT_PATH), true);
    }

    LOG.info("Setting output path to '" + OUTPUT_PATH + "'");
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
    FileOutputFormat.setCompressOutput(job, false);

    LOG.info("Setting input format.");
    job.setInputFormat(ArcInputFormat.class);
    LOG.info("Setting output format.");
    job.setOutputFormat(IndexOutputFormat.class);

    LOG.info("Setting output data types.");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexRecord.class);

    LOG.info("Setting mapper and reducer.");
    job.setMapperClass(IndexerMapper.class);
    job.setMapOutputValueClass(ParsedDocument.class);
    job.setReducerClass(IndexerReducer.class);

    if (JobClient.runJob(job).isSuccessful()) {
        return 0;
    } else {
        return 1;
    }
}

From source file: edu.ub.ahstfg.indexer.wordcount.WordCount.java

License: Open Source License
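
A word count over ARC input files; the commented-out lines show how the same driver can instead run against plain text with TextInputFormat.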

@Override
public int run(String[] args) throws Exception {

    LOG.info("Creating Hadoop job for ARC input files word count.");
    JobConf job = new JobConf(getConf());
    job.setJarByClass(WordCount.class);

    LOG.info("Setting input path to '" + inputPath + "'");
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    // Set filters here if necessary.

    LOG.info("Clearing the output path at '" + outputPath + "'");
    // Change the URI to a Path if necessary.
    FileSystem fs = FileSystem.get(new URI(outputPath), job);

    if (fs.exists(new Path(outputPath))) {
        fs.delete(new Path(outputPath), true);
    }

    LOG.info("Setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    LOG.info("Setting input format.");
    // job.setInputFormat(TextInputFormat.class);
    job.setInputFormat(ArcInputFormat.class);
    LOG.info("Setting output format.");
    job.setOutputFormat(TextOutputFormat.class);

    LOG.info("Setting output data types.");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    LOG.info("Setting mapper and reducer.");
    // job.setMapperClass(WordCountTextInputMapper.class);
    job.setMapperClass(WordCountArcInputMapper.class);
    job.setReducerClass(LongSumReducer.class);

    if (JobClient.runJob(job).isSuccessful()) {
        return 0;
    } else {
        return 1;
    }
}

From source file: edu.ubc.mirrors.holographs.mapreduce.Driver.java

License: Open Source License
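
This driver analyzes a heap snapshot: a custom SnapshotObjectsOfTypeInputFormat produces objects of the target class as input records, and split size and the object limit are tuned through job properties.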

public int run(String[] args) throws Exception {
    JobConf job = new JobConf(getConf());
    job.setClassLoader(Driver.class.getClassLoader());
    job.setInputFormat(SnapshotObjectsOfTypeInputFormat.class);
    job.setMapperClass(InvokeMethodMapper.class);
    job.setCombinerClass(TextCountSumReducer.class);
    job.setReducerClass(TextCountSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.set("snapshotPath", args[0]);
    job.set("targetClassName", "org.eclipse.cdt.internal.core.dom.parser.cpp.CPPASTName");
    job.setInt("splitSize", 10000);
    job.setInt("maxNumObjects", 100000);

    FileInputFormat.addInputPath(job, new Path(args[0]));

    String outputPath = args[1];
    int suffix = 2;
    while (new File(outputPath).exists()) {
        outputPath = args[1] + suffix++;
    }
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    JobClient.runJob(job);
    return 0;
}

From source file: edu.uci.ics.hyracks.imru.dataflow.Hdtest.java

License: Apache License
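
Here the JobConf is used inside a Hyracks job: Hadoop site files are loaded as resources, TextInputFormat is set so that getSplits can compute input splits, and the splits feed an HDFS read operator.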

public static JobSpecification createJob() throws Exception {
    JobSpecification spec = new JobSpecification();
    spec.setFrameSize(4096);

    String PATH_TO_HADOOP_CONF = "/home/wangrui/a/imru/hadoop-0.20.2/conf";
    String HDFS_INPUT_PATH = "/customer/customer.tbl,/customer_result/part-0";
    JobConf conf = new JobConf();
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    conf.setInputFormat(TextInputFormat.class);
    RecordDescriptor recordDesc = new RecordDescriptor(
            new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });
    InputSplit[] splits = conf.getInputFormat().getSplits(conf, 1);
    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(spec, recordDesc, conf, splits,
            new String[] { "NC0", "NC1" }, new IKeyValueParserFactory<LongWritable, Text>() {
                @Override
                public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
                    return new IKeyValueParser<LongWritable, Text>() {
                        TupleWriter tupleWriter;

                        @Override
                        public void open(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter = new TupleWriter(ctx, writer, 1);
                        }

                        @Override
                        public void parse(LongWritable key, Text value, IFrameWriter writer, String fileString)
                                throws HyracksDataException {
                            try {
                                tupleWriter.write(value.getBytes(), 0, value.getLength());
                                tupleWriter.finishField();
                                tupleWriter.finishTuple();
                            } catch (IOException e) {
                                throw new HyracksDataException(e);
                            }
                        }

                        @Override
                        public void close(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter.close();
                        }
                    };
                }

            });

    // createPartitionConstraint(spec, readOperator, new String[] {"NC0"});
    PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, readOperator, new String[] { "NC0", "NC1" });

    IOperatorDescriptor writer = new HDFSOD(spec, null, null, null);
    // createPartitionConstraint(spec, writer, outSplits);

    spec.connect(new OneToOneConnectorDescriptor(spec), readOperator, 0, writer, 0);

    spec.addRoot(writer);
    return spec;
}

From source file: edu.uci.ics.hyracks.imru.jobgen.IMRUJobFactory.java

License: Apache License
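
Here setInputFormat serves only to compute input splits: the JobConf is given a block-oriented format and immediately asked for its splits.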

public InputSplit[] getInputSplits() throws IOException {
    JobConf conf = getConf();
    FileInputFormat.setInputPaths(conf, inputPaths);
    conf.setInputFormat(HDFSBlockFormat.class);
    return conf.getInputFormat().getSplits(conf, 1);
}

From source file: edu.uci.ics.hyracks.imru.util.DataBalancer.java

License: Apache License
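
A driver that redistributes text records across a configurable number of reducers, with optional bzip2 or gzip output compression.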

public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(DataBalancer.class);

    job.setJobName(DataBalancer.class.getSimpleName());
    job.setMapperClass(MapRecordOnly.class);
    job.setReducerClass(ReduceRecordOnly.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(Integer.parseInt(args[2]));

    if (args.length > 3) {
        if (args[3].startsWith("bzip"))
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        if (args[3].startsWith("gz"))
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
    JobClient.runJob(job);
}

From source file: edu.uci.ics.pregelix.core.util.DataGenerator.java

License: Apache License
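
Two chained jobs share TextInputFormat: the first computes the maximum ID in the text input, and the second generates records using that maximum, again with optional output compression.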

public static void main(String[] args) throws IOException {

    JobConf job = new JobConf(DataGenerator.class);
    FileSystem dfs = FileSystem.get(job);
    String maxFile = "/maxtemp";
    dfs.delete(new Path(maxFile), true);

    job.setJobName(DataGenerator.class.getSimpleName() + "max ID");
    job.setMapperClass(MapMaxId.class);
    job.setCombinerClass(CombineMaxId.class);
    job.setReducerClass(ReduceMaxId.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(VLongWritable.class);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(maxFile));
    job.setNumReduceTasks(1);
    JobClient.runJob(job);

    job = new JobConf(DataGenerator.class);
    job.set("hyracks.maxid.file", maxFile);
    job.setInt("hyracks.x", Integer.parseInt(args[2]));
    dfs.delete(new Path(args[1]), true);

    job.setJobName(DataGenerator.class.getSimpleName());
    job.setMapperClass(MapRecordGen.class);
    job.setReducerClass(ReduceRecordGen.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(Integer.parseInt(args[3]));

    if (args.length > 4) {
        if (args[4].startsWith("bzip"))
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        if (args[4].startsWith("gz"))
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
    JobClient.runJob(job);
}

From source file: edu.uci.ics.pregelix.example.utils.CommonSource.java

License: Apache License
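
A preprocessing driver that reads any number of text input paths; the last two arguments name the output path and the reduce-task count.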

public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(GraphPreProcessor.class);

    job.setJobName(GraphPreProcessor.class.getSimpleName());
    job.setMapperClass(MapRecordOnly.class);
    job.setReducerClass(ReduceRecordOnly.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setInputFormat(TextInputFormat.class);
    for (int i = 0; i < args.length - 2; i++) {
        FileInputFormat.addInputPath(job, new Path(args[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(args[args.length - 2]));
    job.setNumReduceTasks(Integer.parseInt(args[args.length - 1]));
    JobClient.runJob(job);
}

From source file: edu.uci.ics.pregelix.example.utils.DuplicateGraph.java

License: Apache License
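
A map-only job (zero reduce tasks) that duplicates a graph stored as text.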

public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(DuplicateGraph.class);

    job.setJobName(DuplicateGraph.class.getSimpleName());
    job.setMapperClass(MapRecordOnly.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(0);
    JobClient.runJob(job);
}