Example usage for org.apache.hadoop.mapred JobConf setInputFormat

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf#setInputFormat from open-source projects.

Prototype

public void setInputFormat(Class<? extends InputFormat> theClass) 

Document

Set the InputFormat implementation for the map-reduce job.
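
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing where setInputFormat fits in a classic org.apache.hadoop.mapred driver. It wires Hadoop's built-in identity mapper and reducer to TextInputFormat purely to illustrate the call:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputFormatExample.class);
        conf.setJobName("setInputFormat-example");

        // The call this page documents: choose how input files are split
        // and turned into records. TextInputFormat produces
        // <LongWritable byte offset, Text line> pairs.
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // The identity mapper and reducer pass records through unchanged,
        // so the job's output types match the input format's key/value types.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}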

Usage

From source file: edu.stolaf.cs.wmrserver.HadoopEngine.java

License: Apache License
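
This method builds and submits a Hadoop Streaming job: it wraps the user's mapper and reducer scripts, packages them into a job JAR, reads tab-delimited input through KeyValueTextInputFormat, optionally enables numeric key sorting, and records the submission in a database.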

public void submit(JobRequest request, long submissionID, File mapperFile, File reducerFile, File packageDir,
        Path inputPath) throws ValidationException, NotFoundException, CompilationException, InternalException {
    // Generate job output path
    Path outputDir = new Path(_homeDir, "out");
    Path outputPath;
    try {
        FileSystem fs = outputDir.getFileSystem(new Configuration());
        outputPath = JobServiceHandler.getNonexistantPath(outputDir, request.getName(), fs);
    } catch (IOException ex) {
        throw JobServiceHandler.wrapException("Could not construct output path.", ex);
    }

    JobConf conf = new JobConf();
    conf.setJobName(request.getName());

    // Set mapper and number of tasks if specified
    StreamJob.setStreamMapper(conf, mapperFile.toString());
    if (request.isSetMapTasks())
        conf.setNumMapTasks(request.getMapTasks());

    // Set reducer and number of tasks if specified
    StreamJob.setStreamReducer(conf, reducerFile.toString());
    if (request.isSetReduceTasks())
        conf.setNumReduceTasks(request.getReduceTasks());

    // Create and set job JAR, including necessary files
    ArrayList<String> jarFiles = new ArrayList<String>();
    jarFiles.add(packageDir.toString());
    String jarPath;
    try {
        jarPath = StreamJob.createJobJar(conf, jarFiles, _tempDir);
    } catch (IOException ex) {
        throw JobServiceHandler.wrapException("Could not create job jar.", ex);
    }
    if (jarPath != null)
        conf.setJar(jarPath);

    // TODO: This is a hack. Rewrite streaming to use DistributedCache.
    //conf.setPattern("mapreduce.job.jar.unpack.pattern",
    //              Pattern.compile(".*"));

    // Set I/O formats and paths
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // Use numeric sort if appropriate
    conf.setBoolean(CONF_NUMERIC, request.isNumericSort());
    if (request.isNumericSort()) {
        conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
        conf.setPartitionerClass(KeyFieldBasedPartitioner.class);
        conf.setKeyFieldComparatorOptions("-n");
        conf.setKeyFieldPartitionerOptions("-n");
    }

    // Set other job information
    conf.set(CONF_USER, request.getUser());
    conf.set(CONF_LANGUAGE, request.getLanguage());
    conf.set(CONF_MAPPER, request.getMapper());
    conf.set(CONF_REDUCER, request.getReducer());

    // Attempt to submit the job

    RunningJob job;
    try {
        JobClient client = new JobClient(new JobConf());
        job = client.submitJob(conf);
    } catch (IOException ex) {
        throw JobServiceHandler.wrapException("There was a serious error while attempting to submit the job.",
                ex);
    }

    try {
        SubmissionDatabase.setSubmitted(submissionID);
        SubmissionDatabase.setHadoopID(submissionID, job.getID().toString());
    } catch (SQLException ex) {
        throw JobServiceHandler.wrapException("Could not update submission in database.", ex);
    }
}

From source file: edu.ub.ahstfg.indexer.Indexer.java

License: Open Source License
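
This driver configures an indexing job over web-archive input: ArcInputFormat reads the ARC files, a custom IndexOutputFormat writes IndexRecord values, and the output path is cleared before the job runs.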

@Override
public int run(String[] arg0) throws Exception {
    LOG.info("Creating Hadoop job for Indexer.");
    JobConf job = new JobConf(getConf());
    job.setJarByClass(Indexer.class);

    LOG.info("Setting input path to '" + INPUT_PATH + "'");
    FileInputFormat.setInputPaths(job, new Path(INPUT_PATH));
    // Set filters here if necessary.

    LOG.info("Clearing the output path at '" + OUTPUT_PATH + "'");
    // Change the URI to a Path if necessary.
    FileSystem fs = FileSystem.get(new URI(OUTPUT_PATH), job);

    if (fs.exists(new Path(OUTPUT_PATH))) {
        fs.delete(new Path(OUTPUT_PATH), true);
    }

    LOG.info("Setting output path to '" + OUTPUT_PATH + "'");
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
    FileOutputFormat.setCompressOutput(job, false);

    LOG.info("Setting input format.");
    job.setInputFormat(ArcInputFormat.class);
    LOG.info("Setting output format.");
    job.setOutputFormat(IndexOutputFormat.class);

    LOG.info("Setting output data types.");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexRecord.class);

    LOG.info("Setting mapper and reducer.");
    job.setMapperClass(IndexerMapper.class);
    job.setMapOutputValueClass(ParsedDocument.class);
    job.setReducerClass(IndexerReducer.class);

    if (JobClient.runJob(job).isSuccessful()) {
        return 0;
    } else {
        return 1;
    }
}

From source file: edu.ub.ahstfg.indexer.wordcount.WordCount.java

License: Open Source License
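
A word count over ARC input files; the commented-out lines show how the same driver can instead run against plain text with TextInputFormat.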

@Override
public int run(String[] args) throws Exception {

    LOG.info("Creating Hadoop job for ARC input files word count.");
    JobConf job = new JobConf(getConf());
    job.setJarByClass(WordCount.class);

    LOG.info("Setting input path to '" + inputPath + "'");
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    // Set filters here if necessary.

    LOG.info("Clearing the output path at '" + outputPath + "'");
    // Change the URI to a Path if necessary.
    FileSystem fs = FileSystem.get(new URI(outputPath), job);

    if (fs.exists(new Path(outputPath))) {
        fs.delete(new Path(outputPath), true);
    }

    LOG.info("Setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    LOG.info("Setting input format.");
    // job.setInputFormat(TextInputFormat.class);
    job.setInputFormat(ArcInputFormat.class);
    LOG.info("Setting output format.");
    job.setOutputFormat(TextOutputFormat.class);

    LOG.info("Setting output data types.");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    LOG.info("Setting mapper and reducer.");
    // job.setMapperClass(WordCountTextInputMapper.class);
    job.setMapperClass(WordCountArcInputMapper.class);
    job.setReducerClass(LongSumReducer.class);

    if (JobClient.runJob(job).isSuccessful()) {
        return 0;
    } else {
        return 1;
    }
}

From source file: edu.ubc.mirrors.holographs.mapreduce.Driver.java

License: Open Source License
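
This driver analyzes a heap snapshot: a custom SnapshotObjectsOfTypeInputFormat produces objects of the target class as input records, and split size and the object limit are tuned through job properties.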

public int run(String[] args) throws Exception {
    JobConf job = new JobConf(getConf());
    job.setClassLoader(Driver.class.getClassLoader());
    job.setInputFormat(SnapshotObjectsOfTypeInputFormat.class);
    job.setMapperClass(InvokeMethodMapper.class);
    job.setCombinerClass(TextCountSumReducer.class);
    job.setReducerClass(TextCountSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.set("snapshotPath", args[0]);
    job.set("targetClassName", "org.eclipse.cdt.internal.core.dom.parser.cpp.CPPASTName");
    job.setInt("splitSize", 10000);
    job.setInt("maxNumObjects", 100000);

    FileInputFormat.addInputPath(job, new Path(args[0]));

    String outputPath = args[1];
    int suffix = 2;
    while (new File(outputPath).exists()) {
        outputPath = args[1] + suffix++;
    }
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    JobClient.runJob(job);
    return 0;
}

From source file: edu.uci.ics.hyracks.imru.dataflow.Hdtest.java

License: Apache License
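
Here the JobConf is used inside a Hyracks job: Hadoop site files are loaded as resources, TextInputFormat is set so that getSplits can compute input splits, and the splits feed an HDFS read operator.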

public static JobSpecification createJob() throws Exception {
    JobSpecification spec = new JobSpecification();
    spec.setFrameSize(4096);

    String PATH_TO_HADOOP_CONF = "/home/wangrui/a/imru/hadoop-0.20.2/conf";
    String HDFS_INPUT_PATH = "/customer/customer.tbl,/customer_result/part-0";
    JobConf conf = new JobConf();
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    conf.setInputFormat(TextInputFormat.class);
    RecordDescriptor recordDesc = new RecordDescriptor(
            new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });
    InputSplit[] splits = conf.getInputFormat().getSplits(conf, 1);
    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(spec, recordDesc, conf, splits,
            new String[] { "NC0", "NC1" }, new IKeyValueParserFactory<LongWritable, Text>() {
                @Override
                public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
                    return new IKeyValueParser<LongWritable, Text>() {
                        TupleWriter tupleWriter;

                        @Override
                        public void open(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter = new TupleWriter(ctx, writer, 1);
                        }

                        @Override
                        public void parse(LongWritable key, Text value, IFrameWriter writer, String fileString)
                                throws HyracksDataException {
                            try {
                                tupleWriter.write(value.getBytes(), 0, value.getLength());
                                tupleWriter.finishField();
                                tupleWriter.finishTuple();
                            } catch (IOException e) {
                                throw new HyracksDataException(e);
                            }
                        }

                        @Override
                        public void close(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter.close();
                        }
                    };
                }

            });

    // createPartitionConstraint(spec, readOperator, new String[] {"NC0"});
    PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, readOperator, new String[] { "NC0", "NC1" });

    IOperatorDescriptor writer = new HDFSOD(spec, null, null, null);
    // createPartitionConstraint(spec, writer, outSplits);

    spec.connect(new OneToOneConnectorDescriptor(spec), readOperator, 0, writer, 0);

    spec.addRoot(writer);
    return spec;
}

From source file: edu.uci.ics.hyracks.imru.jobgen.IMRUJobFactory.java

License: Apache License
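
Here setInputFormat serves only to compute input splits: the JobConf is given a block-oriented format and immediately asked for its splits.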

public InputSplit[] getInputSplits() throws IOException {
    JobConf conf = getConf();
    FileInputFormat.setInputPaths(conf, inputPaths);
    conf.setInputFormat(HDFSBlockFormat.class);
    return conf.getInputFormat().getSplits(conf, 1);
}

From source file: edu.uci.ics.hyracks.imru.util.DataBalancer.java

License: Apache License
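
A driver that redistributes text records across a configurable number of reducers, with optional bzip2 or gzip output compression.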

public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(DataBalancer.class);

    job.setJobName(DataBalancer.class.getSimpleName());
    job.setMapperClass(MapRecordOnly.class);
    job.setReducerClass(ReduceRecordOnly.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(Integer.parseInt(args[2]));

    if (args.length > 3) {
        if (args[3].startsWith("bzip"))
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        if (args[3].startsWith("gz"))
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
    JobClient.runJob(job);
}

From source file: edu.uci.ics.pregelix.core.util.DataGenerator.java

License: Apache License
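
Two chained jobs share TextInputFormat: the first computes the maximum ID in the text input, and the second generates records using that maximum, again with optional output compression.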

public static void main(String[] args) throws IOException {

    JobConf job = new JobConf(DataGenerator.class);
    FileSystem dfs = FileSystem.get(job);
    String maxFile = "/maxtemp";
    dfs.delete(new Path(maxFile), true);

    job.setJobName(DataGenerator.class.getSimpleName() + "max ID");
    job.setMapperClass(MapMaxId.class);
    job.setCombinerClass(CombineMaxId.class);
    job.setReducerClass(ReduceMaxId.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(VLongWritable.class);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(maxFile));
    job.setNumReduceTasks(1);
    JobClient.runJob(job);

    job = new JobConf(DataGenerator.class);
    job.set("hyracks.maxid.file", maxFile);
    job.setInt("hyracks.x", Integer.parseInt(args[2]));
    dfs.delete(new Path(args[1]), true);

    job.setJobName(DataGenerator.class.getSimpleName());
    job.setMapperClass(MapRecordGen.class);
    job.setReducerClass(ReduceRecordGen.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(Integer.parseInt(args[3]));

    if (args.length > 4) {
        if (args[4].startsWith("bzip"))
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        if (args[4].startsWith("gz"))
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
    JobClient.runJob(job);
}

From source file: edu.uci.ics.pregelix.example.utils.CommonSource.java

License: Apache License
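
A preprocessing driver that reads any number of text input paths; the last two arguments name the output path and the reduce-task count.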

public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(GraphPreProcessor.class);

    job.setJobName(GraphPreProcessor.class.getSimpleName());
    job.setMapperClass(MapRecordOnly.class);
    job.setReducerClass(ReduceRecordOnly.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setInputFormat(TextInputFormat.class);
    for (int i = 0; i < args.length - 2; i++) {
        FileInputFormat.addInputPath(job, new Path(args[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(args[args.length - 2]));
    job.setNumReduceTasks(Integer.parseInt(args[args.length - 1]));
    JobClient.runJob(job);
}

From source file: edu.uci.ics.pregelix.example.utils.DuplicateGraph.java

License: Apache License
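
A map-only job (zero reduce tasks) that duplicates a graph stored as text.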

public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(DuplicateGraph.class);

    job.setJobName(DuplicateGraph.class.getSimpleName());
    job.setMapperClass(MapRecordOnly.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setNumReduceTasks(0);
    JobClient.runJob(job);
}