List of usage examples for org.apache.hadoop.mapreduce Job getConfiguration
public Configuration getConfiguration()
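All of the examples below share one pattern: call Job#getConfiguration() to obtain the live, job-scoped Configuration and read or set properties on it before submission. A minimal sketch (the property name my.custom.key is purely illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "getConfiguration-example");
    // getConfiguration() returns the Configuration backing this job;
    // properties set here are serialized out to the job's tasks.
    job.getConfiguration().set("my.custom.key", "some-value"); // illustrative key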
From source file:com.hadoop.mapreduce.TestLzoTextInputFormat.java
License:Open Source License
/**
 * Generate random data, compress it, index and md5 hash the data.
 * Then read it all back and md5 that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
        throws IOException, NoSuchAlgorithmException, InterruptedException {
    if (!GPLNativeCodeLoader.isNativeCodeLoaded()) {
        LOG.warn("Cannot run this test without the native lzo libraries");
        return;
    }

    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());

    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir, true);
    localFs.mkdirs(outputDir);

    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID("123", 0, TaskType.REDUCE, 1, 2));

    // create some input data
    byte[] expectedMd5 = createTestInput(outputDir, localFs, attemptContext, charsToOutput);

    if (testWithIndex) {
        Path lzoFile = new Path(outputDir, lzoFileName);
        LzoTextInputFormat.createIndex(localFs, lzoFile);
    }

    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir);

    List<InputSplit> is = inputFormat.getSplits(job);
    // verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
        assertEquals(3, is.size());
    } else {
        assertEquals(1, is.size());
    }

    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : is) {
        RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(inputSplit, attemptContext);
        rr.initialize(inputSplit, attemptContext);
        while (rr.nextKeyValue()) {
            Text value = rr.getCurrentValue();
            md5.update(value.getBytes(), 0, value.getLength());
        }
        rr.close();
    }

    localFs.close();
    assertTrue(Arrays.equals(expectedMd5, md5.digest()));
}
From source file:com.hhscyber.nl.tweets.svm.test.Tester.java
/**
 * @param args the command line arguments
 * @throws java.io.IOException
 */
public static void main(String[] args) throws IOException, Exception {
    Conf conf = new Conf(args, "");
    Job job = new HBJob(conf, "TweetsSVMTester");
    job.setJarByClass(Tester.class);

    Scan scan = new Scan();
    TableMapReduceUtil.initTableMapperJob("hhscyber:tweets_lang", scan, TestMapper.class,
            ImmutableBytesWritable.class, Put.class, job);
    job.setOutputFormatClass(MultiTableOutputFormat.class);

    job.setReducerClass(TestReducer.class);
    job.setNumReduceTasks(2);

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.addDependencyJars(job.getConfiguration());

    job.waitForCompletion(true);
}
From source file:com.hortonworks.pso.data.generator.mapreduce.DataGenTool.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf()); // new Job(conf, this.getClass().getCanonicalName());
    // Configuration conf = getConf();

    int mappers = 2;
    String output = null;
    String config = null;
    long count = 100;

    List<String> otherArgs = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-mappers".equals(args[i])) {
                mappers = Integer.parseInt(args[++i]);
                otherArgs.add("-Dmapreduce.job.maps=" + Integer.toString(mappers));
            } else if ("-output".equals(args[i])) {
                output = args[++i];
            } else if ("-json.cfg".equals(args[i])) {
                config = args[++i];
            } else if ("-count".equals(args[i])) {
                count = Long.parseLong(args[++i]);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    job.getConfiguration().set("json.cfg", config);

    String[] altArgs = new String[otherArgs.size()];
    otherArgs.toArray(altArgs);
    // side effect: applies any remaining generic options (e.g. -D) to the job configuration
    GenericOptionsParser gop = new GenericOptionsParser(job.getConfiguration(), altArgs);

    DataGenInputFormat.setNumberOfRows(job, count);

    job.setJarByClass(DataGenTool.class);

    Path output_path = new Path(output);
    if (output_path.getFileSystem(getConf()).exists(output_path)) {
        throw new IOException("Output directory " + output_path + " already exists.");
    }
    FileOutputFormat.setOutputPath(job, output_path);

    job.setMapperClass(DataGenMapper.class);
    // Map Only Job
    job.setNumReduceTasks(0);
    // job.setReducerClass(RerateReducer.class);

    job.setInputFormatClass(DataGenInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    // job.setOutputKeyClass(Text.class);
    // job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
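A hypothetical invocation of this tool (the jar name and paths are illustrative, not taken from the project):

    hadoop jar datagen.jar com.hortonworks.pso.data.generator.mapreduce.DataGenTool \
        -mappers 4 -json.cfg /path/to/gen.json -count 1000000 -output /tmp/datagen-out

Note that -output and -json.cfg are effectively required: run() passes both straight into new Path(output) and conf.set("json.cfg", config), so job setup fails if either is omitted.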
From source file:com.iflytek.spider.crawl.CrawlDb.java
License:Apache License
public void update(Path crawlDb, Path[] segments, boolean additionsAllowed, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(getConf());
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.createLockFile(fs, lock, force);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: starting");
        LOG.info("CrawlDb update: db: " + crawlDb);
        LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
        LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
    }

    Job job = CrawlDb.createJob(getConf(), crawlDb);
    job.getConfiguration().setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
    for (int i = 0; i < segments.length; i++) {
        Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
        Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
        if (fs.exists(fetch)) {
            FileInputFormat.addInputPath(job, fetch);
        }
        if (fs.exists(parse)) {
            FileInputFormat.addInputPath(job, parse);
        } else {
            LOG.info(" - skipping invalid segment " + segments[i]);
        }
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: Merging segment data into db.");
    }
    try {
        job.waitForCompletion(true);
    } catch (IOException | InterruptedException | ClassNotFoundException e) {
        // identical cleanup for all three checked exceptions
        LockUtil.removeLockFile(fs, lock);
        Path outPath = FileOutputFormat.getOutputPath(job);
        if (fs.exists(outPath))
            fs.delete(outPath, true);
        throw e;
    }

    CrawlDb.install(job, crawlDb);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: done");
    }
}
From source file:com.iflytek.spider.crawl.CrawlDb.java
License:Apache License
public static void install(Job job, Path crawlDb) throws IOException {
    Path newCrawlDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path old = new Path(crawlDb, "old");
    Path current = new Path(crawlDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old))
            fs.delete(old, true);
        fs.rename(current, old);
    }
    fs.mkdirs(crawlDb);
    fs.rename(newCrawlDb, current);
    if (fs.exists(old))
        fs.delete(old, true);
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.removeLockFile(fs, lock);
}
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    //     AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists)
        throws IOException, InterruptedException, ClassNotFoundException {
    // invert again, partition by host/domain/IP, sort by url hash
    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir);
    }
    Path segment = new Path(segmentsDir, generateSegmentName());
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

    LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers");

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("generate: partition " + segment);
    job.getConfiguration().setInt("partition.url.seed", new Random().nextInt());

    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorInverseMapper.class);
    job.setPartitionerClass(AveragePartition.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(SelectorEntry.class);
    job.setReducerClass(PartitionReducer.class);
    job.setNumReduceTasks(numLists);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.waitForCompletion(true);
    return segment;
}
From source file:com.iflytek.spider.parse.ParseSegment.java
License:Apache License
public void parse(Path segment) throws IOException, InterruptedException, ClassNotFoundException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: starting");
        LOG.info("Parse: segment: " + segment);
    }

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("parse " + segment);

    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    job.getConfiguration().set(Spider.SEGMENT_NAME_KEY, segment.getName());

    job.setInputFormatClass(AvroPairInputFormat.class);
    job.setMapperClass(ParseMapper.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormatClass(ParseOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(UnionData.class);

    job.waitForCompletion(true);
    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: done");
    }
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.Aleph2MultiInputFormatBuilder.java
License:Apache License
/** Sets the per-input configurations in the job
 * @param job
 */
public Job build(final Job job) {
    job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_JOBS,
            _inputs.keySet().stream().collect(Collectors.joining(",")));
    _inputs.entrySet().stream().forEach(Lambdas.wrap_consumer_u(kv -> {
        try (final Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(
                job.getConfiguration(), Configuration.class)) {
            final Configuration new_config = new Configuration(kv.getValue().getConfiguration());
            new_config.set(ALEPH2_MULTI_INPUT_FORMAT_CLAZZ, kv.getValue().getInputFormatClass().getName());
            job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_PREFIX + kv.getKey(),
                    stringifier.toString(new_config));
        }
    }));
    job.setInputFormatClass(Aleph2MultiInputFormat.class);
    return job;
}
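The builder packs each per-input Configuration into a single string property via Hadoop's DefaultStringifier. A minimal sketch of the reverse lookup, as a consuming input format might perform it (the name variable is illustrative; the actual behavior of Aleph2MultiInputFormat is not shown in this listing):

    // Recover one stringified Configuration written by build() above.
    Stringifier<Configuration> stringifier =
            new DefaultStringifier<Configuration>(job.getConfiguration(), Configuration.class);
    String packed = job.getConfiguration().get(ALEPH2_MULTI_INPUT_FORMAT_PREFIX + name); // 'name' is illustrative
    Configuration recovered = stringifier.fromString(packed); // throws IOException on bad input
    String delegateClazz = recovered.get(ALEPH2_MULTI_INPUT_FORMAT_CLAZZ);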
From source file:com.ikanow.aleph2.analytics.hadoop.assets.UpdatedFileInputFormat.java
License:Apache License
/**
 * @param job
 *          the job to modify
 * @param inputDirRecursive
 */
public static void setInputDirRecursive(Job job, boolean inputDirRecursive) {
    job.getConfiguration().setBoolean(INPUT_DIR_RECURSIVE, inputDirRecursive);
}
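The setter just flips a boolean in the job's Configuration; the matching read on the input-format side would look like this sketch (defaulting to false is an assumption that mirrors Hadoop's stock FileInputFormat):

    // Reading the flag back inside the input format (sketch).
    public static boolean getInputDirRecursive(JobContext job) {
        // assumed default: non-recursive listing, as in stock FileInputFormat
        return job.getConfiguration().getBoolean(INPUT_DIR_RECURSIVE, false);
    }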