Example usage for the org.apache.hadoop.mapred.JobConf constructor

List of usage examples for the org.apache.hadoop.mapred.JobConf constructor

Introduction

On this page you can find usage examples for the org.apache.hadoop.mapred.JobConf constructor.

Prototype

public JobConf(boolean loadDefaults) 

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
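
The usage examples below exercise several JobConf constructor overloads, most often JobConf(Configuration conf) and JobConf(Class exampleClass), in addition to the JobConf(boolean loadDefaults) form shown in the prototype. The following is a minimal sketch of the loadDefaults form; the class name and the property printed at the end are illustrative only, and the exact default-resource names can vary between Hadoop versions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;

public class JobConfLoadDefaultsExample {
    public static void main(String[] args) {
        // Pass false to skip loading the default resources (core-default.xml,
        // core-site.xml, ...); only values set explicitly on this JobConf are present.
        JobConf bare = new JobConf(false);
        bare.setJobName("bare-example");

        // The overload used by most examples on this page wraps an existing
        // Configuration and inherits whatever resources it has already loaded.
        JobConf fromDefaults = new JobConf(new Configuration());

        System.out.println("job name: " + bare.getJobName());
        // io.file.buffer.size is defined in core-default.xml, so it is expected to be
        // null for the no-defaults instance and populated for the other one.
        System.out.println("io.file.buffer.size (no defaults): " + bare.get("io.file.buffer.size"));
        System.out.println("io.file.buffer.size (defaults)   : " + fromDefaults.get("io.file.buffer.size"));
    }
}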

Usage

From source file:com.datatorrent.demos.mroperator.MapOperator.java

License:Open Source License

@Override
public void beginWindow(long windowId) {
    if (!emitPartitioningCountOnce) {
        outputCount.emit(new KeyHashValPair<Integer, Integer>(operatorId, 1));
        emitPartitioningCountOnce = true;
    }
    if (reader == null) {
        try {
            reader = inputFormat.getRecordReader(inputSplit, new JobConf(new Configuration()), reporter);
        } catch (IOException e) {
            logger.info("error getting record reader {}", e.getMessage());
        }
    }
    super.beginWindow(windowId);
}

From source file:com.datatorrent.demos.mroperator.MapOperator.java

License:Open Source License

@Override
public void setup(OperatorContext context) {
    if (context != null) {
        operatorId = context.getId();
    }
    reporter = new ReporterImpl(ReporterType.Mapper, new Counters());
    outputCollector = new OutputCollectorImpl<K2, V2>();
    Configuration conf = new Configuration();
    try {
        inputFormat = inputFormatClass.newInstance();
        SerializationFactory serializationFactory = new SerializationFactory(conf);
        Deserializer keyDeserializer = serializationFactory.getDeserializer(inputSplitClass);
        keyDeserializer.open(new ByteArrayInputStream(outstream.toByteArray()));
        inputSplit = (InputSplit) keyDeserializer.deserialize(null);
        ((ReporterImpl) reporter).setInputSplit(inputSplit);
        reader = inputFormat.getRecordReader(inputSplit, new JobConf(conf), reporter);
    } catch (Exception e) {
        logger.info("failed to initialize inputformat obj {}", inputFormat);
        throw new RuntimeException(e);
    }
    InputStream stream = null;
    if (configFile != null && configFile.length() > 0) {
        stream = ClassLoader.getSystemResourceAsStream("/" + configFile);
        if (stream == null) {
            stream = ClassLoader.getSystemResourceAsStream(configFile);
        }
    }
    if (stream != null) {
        conf.addResource(stream);
    }
    jobConf = new JobConf(conf);
    if (mapClass != null) {
        try {
            mapObject = mapClass.newInstance();
        } catch (Exception e) {
            logger.info("can't instantiate object {}", e.getMessage());
        }

        mapObject.configure(jobConf);
    }
    if (combineClass != null) {
        try {
            combineObject = combineClass.newInstance();
        } catch (Exception e) {
            logger.info("can't instantiate object {}", e.getMessage());
        }
        combineObject.configure(jobConf);
    }
}

From source file:com.datatorrent.demos.mroperator.MapOperator.java

License:Open Source License

@SuppressWarnings("rawtypes")
@Override
public Collection<Partition<MapOperator<K1, V1, K2, V2>>> definePartitions(
        Collection<Partition<MapOperator<K1, V1, K2, V2>>> partitions, int incrementalCapacity) {
    Collection c = partitions;
    Collection<Partition<MapOperator<K1, V1, K2, V2>>> operatorPartitions = c;
    Partition<MapOperator<K1, V1, K2, V2>> template = null;
    Iterator<Partition<MapOperator<K1, V1, K2, V2>>> itr = operatorPartitions.iterator();
    template = itr.next();
    Configuration conf = new Configuration();
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    if (outstream.size() == 0) {
        InputSplit[] splits;
        try {
            splits = getSplits(new JobConf(conf), incrementalCapacity + 1,
                    template.getPartitionedInstance().getDirName());
        } catch (Exception e1) {
            logger.info(" can't get splits {}", e1.getMessage());
            throw new RuntimeException(e1);
        }
        Collection<Partition<MapOperator<K1, V1, K2, V2>>> operList = new ArrayList<Partition<MapOperator<K1, V1, K2, V2>>>();
        itr = operatorPartitions.iterator();
        int size = splits.length;
        Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
        while (size > 0 && itr.hasNext()) {
            Partition<MapOperator<K1, V1, K2, V2>> p = itr.next();
            MapOperator<K1, V1, K2, V2> opr = p.getPartitionedInstance();
            opr.setInputFormatClass(inputFormatClass);
            opr.setMapClass(mapClass);
            opr.setCombineClass(combineClass);
            opr.setConfigFile(configFile);
            try {
                keySerializer.open(opr.getOutstream());
                keySerializer.serialize(splits[size - 1]);
                opr.setInputSplitClass(splits[size - 1].getClass());
            } catch (IOException e) {
                logger.info("error while serializing {}", e.getMessage());
            }
            size--;
            operList.add(p);
        }
        while (size > 0) {
            MapOperator<K1, V1, K2, V2> opr = new MapOperator<K1, V1, K2, V2>();
            opr.setInputFormatClass(inputFormatClass);
            opr.setMapClass(mapClass);
            opr.setCombineClass(combineClass);
            opr.setConfigFile(configFile);
            try {
                keySerializer.open(opr.getOutstream());
                keySerializer.serialize(splits[size - 1]);
                opr.setInputSplitClass(splits[size - 1].getClass());
            } catch (IOException e) {
                logger.info("error while serializing {}", e.getMessage());
            }
            size--;
            operList.add(new DefaultPartition<MapOperator<K1, V1, K2, V2>>(opr));
        }
        try {
            keySerializer.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return operList;
    }
    return null;
}

From source file:com.datatorrent.demos.mroperator.MapOperatorTest.java

License:Open Source License

public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper)
        throws IOException {

    CollectorTestSink sortSink = new CollectorTestSink();
    oper.output.setSink(sortSink);

    oper.setMapClass(WordCount.Map.class);
    oper.setCombineClass(WordCount.Reduce.class);
    oper.setDirName("src/test/resources/mroperator/");
    oper.setConfigFile(null);
    oper.setInputFormatClass(TextInputFormat.class);

    Configuration conf = new Configuration();
    JobConf jobConf = new JobConf(conf);
    FileInputFormat.setInputPaths(jobConf, new Path("src/test/resources/mroperator/"));
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(jobConf);
    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
    keySerializer.open(oper.getOutstream());
    keySerializer.serialize(splits[0]);
    oper.setInputSplitClass(splits[0].getClass());
    keySerializer.close();
    oper.setup(null);
    oper.beginWindow(0);
    oper.emitTuples();
    oper.emitTuples();
    oper.endWindow();
    oper.beginWindow(1);
    oper.emitTuples();
    oper.endWindow();

    Assert.assertEquals("number emitted tuples", 6, sortSink.collectedTuples.size());
    for (Object o : sortSink.collectedTuples) {
        logger.debug(o.toString());
    }
    logger.debug("Done testing round\n");
}

From source file:com.datatorrent.demos.mroperator.ReduceOperator.java

License:Open Source License

@Override
public void setup(OperatorContext context) {
    reporter = new ReporterImpl(ReporterType.Reducer, new Counters());
    if (context != null) {
        operatorId = context.getId();
    }
    cacheObject = new HashMap<K1, List<V1>>();
    outputCollector = new OutputCollectorImpl<K2, V2>();
    if (reduceClass != null) {
        try {
            reduceObj = reduceClass.newInstance();
        } catch (Exception e) {
            logger.info("can't instantiate object {}", e.getMessage());
            throw new RuntimeException(e);
        }
        Configuration conf = new Configuration();
        InputStream stream = null;
        if (configFile != null && configFile.length() > 0) {
            logger.info("system /{}", configFile);
            stream = ClassLoader.getSystemResourceAsStream("/" + configFile);
            if (stream == null) {
                logger.info("system {}", configFile);
                stream = ClassLoader.getSystemResourceAsStream(configFile);
            }
        }
        if (stream != null) {
            logger.info("found our stream... so adding it");
            conf.addResource(stream);
        }
        reduceObj.configure(new JobConf(conf));
    }

}

From source file:com.datatorrent.demos.mroperator.WordCount.java

License:Open Source License

public void run(String[] args) throws Exception {

    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}

From source file:com.digitalpebble.behemoth.ClassifierJob.java

License:Apache License

public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");
    options.addOption("m", "model", true, "location of the model");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        String model = line.getOptionValue("m");
        if (line.hasOption("help")) {
            formatter.printHelp("ClassifierJob", options);
            return 0;
        }
        if (model == null || input == null || output == null) {
            formatter.printHelp("ClassifierJob", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("ClassifierJob", options);
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));
    String modelPath = line.getOptionValue("m");

    JobConf job = new JobConf(getConf());

    // push the model file to the DistributedCache
    DistributedCache.addCacheArchive(new URI(modelPath), job);

    job.setJarByClass(this.getClass());

    job.setJobName("ClassifierJob : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(TextClassifierMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.set(modelNameParam, modelPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.commoncrawl.CorpusMerger.java

License:Apache License

public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusMerger", options);
            return 0;
        }
        if (input == null) {
            formatter.printHelp("CorpusMerger", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusMerger", options);
        return -1;
    }

    Path outputPath = new Path(line.getOptionValue("o"));

    String[] paths = (line.getOptionValues("i"));

    JobConf job = new JobConf(getConf());
    // MUST not forget the line below
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusMerger");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    //job.setMapperClass(IdentityMapper.class);

    job.setReducerClass(MergerReducer.class);

    for (String in : paths)
        FileInputFormat.addInputPath(job, new Path(in));

    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("CorpusMerger completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
        // fs.delete(outputPath, true);
    } finally {
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.es.ESIndexerJob.java

License:Apache License

public int run(String[] args) throws Exception {

    if (args.length != 1) {
        String syntax = "com.digitalpebble.behemoth.ESIndexerJob input";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);

    JobConf job = new JobConf(getConf());

    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into ElasticSearch");

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapOutputValueClass(MapWritable.class);

    job.setMapperClass(BehemothToESMapper.class);

    job.setSpeculativeExecution(false); // disable speculative execution
    // when writing to ES

    // job.set("es.resource", "radio/artists"); // index used for storing
    // data
    job.setOutputFormat(EsOutputFormat.class); // use dedicated output
    // format

    FileInputFormat.addInputPath(job, inputPath);

    // no reducer : send straight to elasticsearch at end of mapping
    job.setNumReduceTasks(0);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("ESIndexerJob completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception while running job", e);
        return -1;
    }
    return 0;
}

From source file:com.digitalpebble.behemoth.gate.GATEDriver.java

License:Apache License

public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length < 3 || args.length > 4) {
        String syntax = "com.digitalpebble.behemoth.gate.GATEDriver in out path_gate_file [-XML]";
        System.err.println(syntax);
        return -1;
    }

    boolean dumpGATEXML = false;

    for (String arg : args) {
        if (arg.equalsIgnoreCase("-xml"))
            dumpGATEXML = true;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String zip_application_path = args[2];

    // check that the GATE application has been stored on HDFS
    Path zap = new Path(zip_application_path);
    if (!fs.exists(zap)) {
        System.err.println(
                "The GATE application " + zip_application_path + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    // MUST not forget the line below
    job.setJarByClass(this.getClass());

    job.setJobName("Processing " + args[0] + " with GATE application from " + zip_application_path);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);

    if (dumpGATEXML) {
        job.setOutputValueClass(Text.class);
        job.setMapperClass(GATEXMLMapper.class);
    } else {
        job.setOutputValueClass(BehemothDocument.class);
        job.setMapperClass(GATEMapper.class);
    }

    // detect if any filters have been defined
    // and activate the reducer accordingly
    boolean isFilterRequired = BehemothReducer.isRequired(job);
    if (isFilterRequired)
        job.setReducerClass(BehemothReducer.class);
    else {
        job.setNumReduceTasks(0);
    }

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the zipped_gate_application onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zip_application_path), job);

    job.set("gate.application.path", zip_application_path.toString());

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("GATEDriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
        // leave even partial output
        // fs.delete(outputPath, true);
    } finally {
    }

    return 0;
}