List of usage examples for the org.apache.hadoop.mapred.JobConf constructor
public JobConf(boolean loadDefaults)
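Before the per-project examples below, here is a minimal, self-contained sketch of the loadDefaults overload itself (the examples that follow mostly use other JobConf constructors). Passing false means the instance does not load the default Hadoop resource files, so only values set explicitly or added via addResource are visible; the property name "my.app.setting" and the resource path used here are placeholders for illustration, not part of any real deployment.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfLoadDefaultsExample {
    public static void main(String[] args) {
        // Create a JobConf that does NOT load the default resources
        // (core-default.xml, core-site.xml, mapred-site.xml, ...).
        JobConf conf = new JobConf(false);

        // Only explicitly set values are present on this instance.
        conf.set("my.app.setting", "value"); // hypothetical property name
        conf.setJobName("load-defaults-example");

        // Configuration resources can still be added by hand if needed.
        conf.addResource(new Path("/etc/hadoop/conf/custom-site.xml")); // hypothetical path

        System.out.println("my.app.setting = " + conf.get("my.app.setting"));
        System.out.println("job name = " + conf.getJobName());
    }
}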
From source file:com.datatorrent.demos.mroperator.MapOperator.java
License:Open Source License
@Override
public void beginWindow(long windowId) {
    if (!emitPartitioningCountOnce) {
        outputCount.emit(new KeyHashValPair<Integer, Integer>(operatorId, 1));
        emitPartitioningCountOnce = true;
    }
    if (reader == null) {
        try {
            reader = inputFormat.getRecordReader(inputSplit, new JobConf(new Configuration()), reporter);
        } catch (IOException e) {
            logger.info("error getting record reader {}", e.getMessage());
        }
    }
    super.beginWindow(windowId);
}
From source file:com.datatorrent.demos.mroperator.MapOperator.java
License:Open Source License
@Override
public void setup(OperatorContext context) {
    if (context != null) {
        operatorId = context.getId();
    }
    reporter = new ReporterImpl(ReporterType.Mapper, new Counters());
    outputCollector = new OutputCollectorImpl<K2, V2>();
    Configuration conf = new Configuration();
    try {
        inputFormat = inputFormatClass.newInstance();
        SerializationFactory serializationFactory = new SerializationFactory(conf);
        Deserializer keyDeserializer = serializationFactory.getDeserializer(inputSplitClass);
        keyDeserializer.open(new ByteArrayInputStream(outstream.toByteArray()));
        inputSplit = (InputSplit) keyDeserializer.deserialize(null);
        ((ReporterImpl) reporter).setInputSplit(inputSplit);
        reader = inputFormat.getRecordReader(inputSplit, new JobConf(conf), reporter);
    } catch (Exception e) {
        logger.info("failed to initialize inputformat obj {}", inputFormat);
        throw new RuntimeException(e);
    }
    InputStream stream = null;
    if (configFile != null && configFile.length() > 0) {
        stream = ClassLoader.getSystemResourceAsStream("/" + configFile);
        if (stream == null) {
            stream = ClassLoader.getSystemResourceAsStream(configFile);
        }
    }
    if (stream != null) {
        conf.addResource(stream);
    }
    jobConf = new JobConf(conf);
    if (mapClass != null) {
        try {
            mapObject = mapClass.newInstance();
        } catch (Exception e) {
            logger.info("can't instantiate object {}", e.getMessage());
        }
        mapObject.configure(jobConf);
    }
    if (combineClass != null) {
        try {
            combineObject = combineClass.newInstance();
        } catch (Exception e) {
            logger.info("can't instantiate object {}", e.getMessage());
        }
        combineObject.configure(jobConf);
    }
}
From source file:com.datatorrent.demos.mroperator.MapOperator.java
License:Open Source License
@SuppressWarnings("rawtypes") @Override/* ww w .j a va 2 s .co m*/ public Collection<Partition<MapOperator<K1, V1, K2, V2>>> definePartitions( Collection<Partition<MapOperator<K1, V1, K2, V2>>> partitions, int incrementalCapacity) { Collection c = partitions; Collection<Partition<MapOperator<K1, V1, K2, V2>>> operatorPartitions = c; Partition<MapOperator<K1, V1, K2, V2>> template = null; Iterator<Partition<MapOperator<K1, V1, K2, V2>>> itr = operatorPartitions.iterator(); template = itr.next(); Configuration conf = new Configuration(); SerializationFactory serializationFactory = new SerializationFactory(conf); if (outstream.size() == 0) { InputSplit[] splits; try { splits = getSplits(new JobConf(conf), incrementalCapacity + 1, template.getPartitionedInstance().getDirName()); } catch (Exception e1) { logger.info(" can't get splits {}", e1.getMessage()); throw new RuntimeException(e1); } Collection<Partition<MapOperator<K1, V1, K2, V2>>> operList = new ArrayList<Partition<MapOperator<K1, V1, K2, V2>>>(); itr = operatorPartitions.iterator(); int size = splits.length; Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass()); while (size > 0 && itr.hasNext()) { Partition<MapOperator<K1, V1, K2, V2>> p = itr.next(); MapOperator<K1, V1, K2, V2> opr = p.getPartitionedInstance(); opr.setInputFormatClass(inputFormatClass); opr.setMapClass(mapClass); opr.setCombineClass(combineClass); opr.setConfigFile(configFile); try { keySerializer.open(opr.getOutstream()); keySerializer.serialize(splits[size - 1]); opr.setInputSplitClass(splits[size - 1].getClass()); } catch (IOException e) { logger.info("error while serializing {}", e.getMessage()); } size--; operList.add(p); } while (size > 0) { MapOperator<K1, V1, K2, V2> opr = new MapOperator<K1, V1, K2, V2>(); opr.setInputFormatClass(inputFormatClass); opr.setMapClass(mapClass); opr.setCombineClass(combineClass); opr.setConfigFile(configFile); try { keySerializer.open(opr.getOutstream()); keySerializer.serialize(splits[size - 1]); opr.setInputSplitClass(splits[size - 1].getClass()); } catch (IOException e) { logger.info("error while serializing {}", e.getMessage()); } size--; operList.add(new DefaultPartition<MapOperator<K1, V1, K2, V2>>(opr)); } try { keySerializer.close(); } catch (IOException e) { throw new RuntimeException(e); } return operList; } return null; }
From source file:com.datatorrent.demos.mroperator.MapOperatorTest.java
License:Open Source License
public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper)
        throws IOException {
    CollectorTestSink sortSink = new CollectorTestSink();
    oper.output.setSink(sortSink);
    oper.setMapClass(WordCount.Map.class);
    oper.setCombineClass(WordCount.Reduce.class);
    oper.setDirName("src/test/resources/mroperator/");
    oper.setConfigFile(null);
    oper.setInputFormatClass(TextInputFormat.class);

    Configuration conf = new Configuration();
    JobConf jobConf = new JobConf(conf);
    FileInputFormat.setInputPaths(jobConf, new Path("src/test/resources/mroperator/"));
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(jobConf);

    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
    keySerializer.open(oper.getOutstream());
    keySerializer.serialize(splits[0]);
    oper.setInputSplitClass(splits[0].getClass());
    keySerializer.close();

    oper.setup(null);
    oper.beginWindow(0);
    oper.emitTuples();
    oper.emitTuples();
    oper.endWindow();
    oper.beginWindow(1);
    oper.emitTuples();
    oper.endWindow();

    Assert.assertEquals("number emitted tuples", 6, sortSink.collectedTuples.size());
    for (Object o : sortSink.collectedTuples) {
        logger.debug(o.toString());
    }
    logger.debug("Done testing round\n");
}
From source file:com.datatorrent.demos.mroperator.ReduceOperator.java
License:Open Source License
@Override
public void setup(OperatorContext context) {
    reporter = new ReporterImpl(ReporterType.Reducer, new Counters());
    if (context != null) {
        operatorId = context.getId();
    }
    cacheObject = new HashMap<K1, List<V1>>();
    outputCollector = new OutputCollectorImpl<K2, V2>();
    if (reduceClass != null) {
        try {
            reduceObj = reduceClass.newInstance();
        } catch (Exception e) {
            logger.info("can't instantiate object {}", e.getMessage());
            throw new RuntimeException(e);
        }
        Configuration conf = new Configuration();
        InputStream stream = null;
        if (configFile != null && configFile.length() > 0) {
            logger.info("system /{}", configFile);
            stream = ClassLoader.getSystemResourceAsStream("/" + configFile);
            if (stream == null) {
                logger.info("system {}", configFile);
                stream = ClassLoader.getSystemResourceAsStream(configFile);
            }
        }
        if (stream != null) {
            logger.info("found our stream... so adding it");
            conf.addResource(stream);
        }
        reduceObj.configure(new JobConf(conf));
    }
}
From source file:com.datatorrent.demos.mroperator.WordCount.java
License:Open Source License
public void run(String[] args) throws Exception {
    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("o", "output", true, "output Behemoth corpus"); options.addOption("m", "model", true, "location of the model"); // parse the command line arguments CommandLine line = null;//from w ww . j a v a 2 s. co m try { line = parser.parse(options, args); String input = line.getOptionValue("i"); String output = line.getOptionValue("o"); String model = line.getOptionValue("m"); if (line.hasOption("help")) { formatter.printHelp("ClassifierJob", options); return 0; } if (model == null | input == null | output == null) { formatter.printHelp("ClassifierJob", options); return -1; } } catch (ParseException e) { formatter.printHelp("ClassifierJob", options); } final FileSystem fs = FileSystem.get(getConf()); Path inputPath = new Path(line.getOptionValue("i")); Path outputPath = new Path(line.getOptionValue("o")); String modelPath = line.getOptionValue("m"); JobConf job = new JobConf(getConf()); // push the model file to the DistributedCache DistributedCache.addCacheArchive(new URI(modelPath), job); job.setJarByClass(this.getClass()); job.setJobName("ClassifierJob : " + inputPath.toString()); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(TextClassifierMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.set(modelNameParam, modelPath); try { JobClient.runJob(job); } catch (Exception e) { e.printStackTrace(); } finally { } return 0; }
From source file:com.digitalpebble.behemoth.commoncrawl.CorpusMerger.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("o", "output", true, "output Behemoth corpus"); // parse the command line arguments CommandLine line = null;/*from w ww.j a v a 2s . c o m*/ try { line = parser.parse(options, args); String input = line.getOptionValue("i"); if (line.hasOption("help")) { formatter.printHelp("CorpusMerger", options); return 0; } if (input == null) { formatter.printHelp("CorpusMerger", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusMerger", options); return -1; } Path outputPath = new Path(line.getOptionValue("o")); String[] paths = (line.getOptionValues("i")); JobConf job = new JobConf(getConf()); // MUST not forget the line below job.setJarByClass(this.getClass()); job.setJobName("CorpusMerger"); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); //job.setMapperClass(IdentityMapper.class); job.setReducerClass(MergerReducer.class); for (String in : paths) FileInputFormat.addInputPath(job, new Path(in)); FileOutputFormat.setOutputPath(job, outputPath); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("CorpusMerger completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { LOG.error("Exception caught", e); // fs.delete(outputPath, true); } finally { } return 0; }
From source file:com.digitalpebble.behemoth.es.ESIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 1) {
        String syntax = "com.digitalpebble.behemoth.ESIndexerJob input";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into ElasticSearch");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputValueClass(MapWritable.class);
    job.setMapperClass(BehemothToESMapper.class);

    // disable speculative execution when writing to ES
    job.setSpeculativeExecution(false);

    // job.set("es.resource", "radio/artists"); // index used for storing data

    // use dedicated output format
    job.setOutputFormat(EsOutputFormat.class);

    FileInputFormat.addInputPath(job, inputPath);

    // no reducer : send straight to elasticsearch at end of mapping
    job.setNumReduceTasks(0);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("ESIndexerJob completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception while running job", e);
        return -1;
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.gate.GATEDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    if (args.length < 3 || args.length > 4) {
        String syntax = "com.digitalpebble.behemoth.gate.GATEDriver in out path_gate_file [-XML]";
        System.err.println(syntax);
        return -1;
    }

    boolean dumpGATEXML = false;
    for (String arg : args) {
        if (arg.equalsIgnoreCase("-xml"))
            dumpGATEXML = true;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String zip_application_path = args[2];

    // check that the GATE application has been stored on HDFS
    Path zap = new Path(zip_application_path);
    if (fs.exists(zap) == false) {
        System.err.println("The GATE application " + zip_application_path
                + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    // MUST not forget the line below
    job.setJarByClass(this.getClass());

    job.setJobName("Processing " + args[0] + " with GATE application from " + zip_application_path);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);

    if (dumpGATEXML) {
        job.setOutputValueClass(Text.class);
        job.setMapperClass(GATEXMLMapper.class);
    } else {
        job.setOutputValueClass(BehemothDocument.class);
        job.setMapperClass(GATEMapper.class);
    }

    // detect if any filters have been defined
    // and activate the reducer accordingly
    boolean isFilterRequired = BehemothReducer.isRequired(job);
    if (isFilterRequired)
        job.setReducerClass(BehemothReducer.class);
    else {
        job.setNumReduceTasks(0);
    }

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the zipped_gate_application onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zip_application_path), job);

    job.set("gate.application.path", zip_application_path);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("GATEDriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
        // leave even partial output
        // fs.delete(outputPath, true);
    }

    return 0;
}