Example usage for org.apache.hadoop.mapreduce Job getJobID

List of usage examples for org.apache.hadoop.mapreduce Job getJobID

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.getJobID().

Prototype

public JobID getJobID() 

Source Link

Document

Get the unique ID for the job.

Usage

From source file:org.apache.hcatalog.templeton.tool.TempletonControllerJob.java

License:Apache License

/**
 * Enqueue the controller job and record its job id for later collection.
 *
 * <p>The job is map-only ({@code LaunchMapper}, zero reduce tasks) and uses
 * {@code NullOutputFormat}. It is submitted without waiting for completion and
 * the id assigned on submission is stored in {@code submittedJobId}.
 *
 * @param args arguments, encoded into the job configuration under {@code JAR_ARGS_NAME}
 * @return always 0
 * @throws IOException if the job cannot be configured or submitted
 * @throws InterruptedException if submission is interrupted
 * @throws ClassNotFoundException if a class required for submission cannot be found
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    conf.set(JAR_ARGS_NAME, TempletonUtils.encodeArray(args));
    conf.set("user.name", UserGroupInformation.getCurrentUser().getShortUserName());
    Job job = new Job(conf);
    job.setJarByClass(TempletonControllerJob.class);
    job.setJobName("TempletonControllerJob");
    job.setMapperClass(LaunchMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormatClass(SingleInputFormat.class);
    // Reference the class literal directly; no need to instantiate the format to get its class.
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    // Delegation tokens are only meaningful on a secured cluster; requesting one
    // otherwise can fail or yield no token, so skip the step when security is off.
    if (UserGroupInformation.isSecurityEnabled()) {
        JobClient jc = new JobClient(new JobConf(job.getConfiguration()));
        Token<DelegationTokenIdentifier> mrdt = jc.getDelegationToken(new Text("mr token"));
        job.getCredentials().addToken(new Text("mr token"), mrdt);
    }
    job.submit();

    // The job id is available once the job has been submitted.
    submittedJobId = job.getJobID();

    return 0;
}

From source file:org.apache.hive.hcatalog.templeton.tool.TempletonControllerJob.java

License:Apache License

/**
 * Submits the Templeton controller job and remembers its job id for later collection.
 *
 * <p>Optional memory and JVM settings from {@code appConf} are copied into the job
 * configuration when they are non-empty. If {@code addHMSToken} returns a metastore
 * token, it is cached under the submitted job id so that it can be cancelled later.
 *
 * @param args arguments, encoded into the job configuration under {@code JAR_ARGS_NAME}
 * @return always 0
 * @see org.apache.hive.hcatalog.templeton.CompleteDelegator
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException, TException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("Preparing to submit job: " + Arrays.toString(args));
    }

    final Configuration jobSettings = getConf();
    jobSettings.set(JAR_ARGS_NAME, TempletonUtils.encodeArray(args));

    // Optional overrides: each one is applied only when a non-empty value is configured.
    final String mapperMemory = appConf.mapperMemoryMb();
    if (mapperMemory != null && !mapperMemory.isEmpty()) {
        jobSettings.set(AppConfig.HADOOP_MAP_MEMORY_MB, mapperMemory);
    }
    final String appMasterMemory = appConf.amMemoryMb();
    if (appMasterMemory != null && !appMasterMemory.isEmpty()) {
        jobSettings.set(AppConfig.HADOOP_MR_AM_MEMORY_MB, appMasterMemory);
    }
    final String appMasterJavaOpts = appConf.controllerAMChildOpts();
    if (appMasterJavaOpts != null && !appMasterJavaOpts.isEmpty()) {
        jobSettings.set(AppConfig.HADOOP_MR_AM_JAVA_OPTS, appMasterJavaOpts);
    }

    final String submitter = UserGroupInformation.getCurrentUser().getShortUserName();
    jobSettings.set("user.name", submitter);

    // Map-only job: a single mapper class, no reducers, output discarded.
    final Job controllerJob = new Job(jobSettings);
    controllerJob.setJarByClass(LaunchMapper.class);
    controllerJob.setJobName(TempletonControllerJob.class.getSimpleName());
    controllerJob.setMapperClass(LaunchMapper.class);
    controllerJob.setMapOutputKeyClass(Text.class);
    controllerJob.setMapOutputValueClass(Text.class);
    controllerJob.setInputFormatClass(SingleInputFormat.class);

    final NullOutputFormat<NullWritable, NullWritable> discardingFormat =
            new NullOutputFormat<NullWritable, NullWritable>();
    controllerJob.setOutputFormatClass(discardingFormat.getClass());
    controllerJob.setNumReduceTasks(0);

    final JobClient client = new JobClient(new JobConf(controllerJob.getConfiguration()));

    if (UserGroupInformation.isSecurityEnabled()) {
        final Token<DelegationTokenIdentifier> mrToken = client.getDelegationToken(new Text("mr token"));
        controllerJob.getCredentials().addToken(new Text("mr token"), mrToken);
    }
    final String metastoreToken = addHMSToken(controllerJob, submitter);

    controllerJob.submit();

    submittedJobId = controllerJob.getJobID();
    if (metastoreToken != null) {
        // Cache the token by job id so that CompleteDelegator can cancel it later.
        final String jobIdText = submittedJobId.toString();
        DelegationTokenCache.getStringFormTokenCache().storeDelegationToken(jobIdText, metastoreToken);
        LOG.debug("Added metastore delegation token for jobId=" + jobIdText + " user=" + submitter);
    }
    return 0;
}

From source file:org.apache.ignite.client.hadoop.GridHadoopClientProtocolSelfTest.java

License:Apache License

/**
 * Test job submission./*  www . j av a 2  s.c  o m*/
 *
 * @param noCombiners Whether there are no combiners.
 * @param noReducers Whether there are no reducers.
 * @throws Exception If failed.
 */
public void checkJobSubmit(boolean noCombiners, boolean noReducers) throws Exception {
    IgniteFs igfs = grid(0).fileSystem(GridHadoopAbstractSelfTest.igfsName);

    igfs.mkdirs(new IgfsPath(PATH_INPUT));

    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(igfs.create(new IgfsPath(PATH_INPUT + "/test.file"), true)))) {

        bw.write("word");
    }

    Configuration conf = config(GridHadoopAbstractSelfTest.REST_PORT);

    final Job job = Job.getInstance(conf);

    job.setJobName(JOB_NAME);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(TestMapper.class);
    job.setReducerClass(TestReducer.class);

    if (!noCombiners)
        job.setCombinerClass(TestCombiner.class);

    if (noReducers)
        job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TestOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(PATH_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(PATH_OUTPUT));

    job.submit();

    JobID jobId = job.getJobID();

    // Setup phase.
    JobStatus jobStatus = job.getStatus();
    checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
    assert jobStatus.getSetupProgress() >= 0.0f && jobStatus.getSetupProgress() < 1.0f;
    assert jobStatus.getMapProgress() == 0.0f;
    assert jobStatus.getReduceProgress() == 0.0f;

    U.sleep(2100);

    JobStatus recentJobStatus = job.getStatus();

    assert recentJobStatus.getSetupProgress() > jobStatus.getSetupProgress() : "Old="
            + jobStatus.getSetupProgress() + ", new=" + recentJobStatus.getSetupProgress();

    // Transferring to map phase.
    setupLockFile.delete();

    assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
        @Override
        public boolean apply() {
            try {
                return F.eq(1.0f, job.getStatus().getSetupProgress());
            } catch (Exception e) {
                throw new RuntimeException("Unexpected exception.", e);
            }
        }
    }, 5000L);

    // Map phase.
    jobStatus = job.getStatus();
    checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
    assert jobStatus.getSetupProgress() == 1.0f;
    assert jobStatus.getMapProgress() >= 0.0f && jobStatus.getMapProgress() < 1.0f;
    assert jobStatus.getReduceProgress() == 0.0f;

    U.sleep(2100);

    recentJobStatus = job.getStatus();

    assert recentJobStatus.getMapProgress() > jobStatus.getMapProgress() : "Old=" + jobStatus.getMapProgress()
            + ", new=" + recentJobStatus.getMapProgress();

    // Transferring to reduce phase.
    mapLockFile.delete();

    assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
        @Override
        public boolean apply() {
            try {
                return F.eq(1.0f, job.getStatus().getMapProgress());
            } catch (Exception e) {
                throw new RuntimeException("Unexpected exception.", e);
            }
        }
    }, 5000L);

    if (!noReducers) {
        // Reduce phase.
        jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() == 1.0f;
        assert jobStatus.getReduceProgress() >= 0.0f && jobStatus.getReduceProgress() < 1.0f;

        // Ensure that reduces progress increases.
        U.sleep(2100);

        recentJobStatus = job.getStatus();

        assert recentJobStatus.getReduceProgress() > jobStatus.getReduceProgress() : "Old="
                + jobStatus.getReduceProgress() + ", new=" + recentJobStatus.getReduceProgress();

        reduceLockFile.delete();
    }

    job.waitForCompletion(false);

    jobStatus = job.getStatus();
    checkJobStatus(job.getStatus(), jobId, JOB_NAME, JobStatus.State.SUCCEEDED, 1.0f);
    assert jobStatus.getSetupProgress() == 1.0f;
    assert jobStatus.getMapProgress() == 1.0f;
    assert jobStatus.getReduceProgress() == 1.0f;

    dumpIgfs(igfs, new IgfsPath(PATH_OUTPUT));
}

From source file:org.apache.ignite.client.hadoop.HadoopClientProtocolSelfTest.java

License:Apache License

/**
 * Test job submission.
 *
 * <p>Submits a job and walks it through the setup, map and (optionally) reduce phases.
 * Each phase is released by deleting the corresponding lock file, and reported
 * progress is checked before and after each transition.
 *
 * @param noCombiners Whether there are no combiners.
 * @param noReducers Whether there are no reducers.
 * @throws Exception If failed.
 */
public void checkJobSubmit(boolean noCombiners, boolean noReducers) throws Exception {
    IgniteFileSystem igfs = grid(0).fileSystem(HadoopAbstractSelfTest.igfsName);

    igfs.mkdirs(new IgfsPath(PATH_INPUT));

    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(igfs.create(new IgfsPath(PATH_INPUT + "/test.file"), true)))) {

        bw.write("word");
    }

    Configuration conf = config(HadoopAbstractSelfTest.REST_PORT);

    final Job job = Job.getInstance(conf);

    try {
        job.setJobName(JOB_NAME);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(TestMapper.class);
        job.setReducerClass(TestReducer.class);

        if (!noCombiners)
            job.setCombinerClass(TestCombiner.class);

        if (noReducers)
            job.setNumReduceTasks(0);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TestOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(PATH_INPUT));
        FileOutputFormat.setOutputPath(job, new Path(PATH_OUTPUT));

        job.submit();

        JobID jobId = job.getJobID();

        // Setup phase.
        JobStatus jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() >= 0.0f && jobStatus.getSetupProgress() < 1.0f;
        assert jobStatus.getMapProgress() == 0.0f;
        assert jobStatus.getReduceProgress() == 0.0f;

        U.sleep(2100);

        JobStatus recentJobStatus = job.getStatus();

        assert recentJobStatus.getSetupProgress() > jobStatus.getSetupProgress() : "Old="
                + jobStatus.getSetupProgress() + ", new=" + recentJobStatus.getSetupProgress();

        // Transferring to map phase.
        setupLockFile.delete();

        // Wait outside of 'assert' so the wait still happens when assertions are disabled.
        boolean setupDone = GridTestUtils.waitForCondition(new GridAbsPredicate() {
            @Override
            public boolean apply() {
                try {
                    return F.eq(1.0f, job.getStatus().getSetupProgress());
                } catch (Exception e) {
                    throw new RuntimeException("Unexpected exception.", e);
                }
            }
        }, 5000L);

        assert setupDone : "Setup phase did not complete in time.";

        // Map phase.
        jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() >= 0.0f && jobStatus.getMapProgress() < 1.0f;
        assert jobStatus.getReduceProgress() == 0.0f;

        U.sleep(2100);

        recentJobStatus = job.getStatus();

        assert recentJobStatus.getMapProgress() > jobStatus.getMapProgress() : "Old="
                + jobStatus.getMapProgress() + ", new=" + recentJobStatus.getMapProgress();

        // Transferring to reduce phase.
        mapLockFile.delete();

        boolean mapDone = GridTestUtils.waitForCondition(new GridAbsPredicate() {
            @Override
            public boolean apply() {
                try {
                    return F.eq(1.0f, job.getStatus().getMapProgress());
                } catch (Exception e) {
                    throw new RuntimeException("Unexpected exception.", e);
                }
            }
        }, 5000L);

        assert mapDone : "Map phase did not complete in time.";

        if (!noReducers) {
            // Reduce phase.
            jobStatus = job.getStatus();
            checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
            assert jobStatus.getSetupProgress() == 1.0f;
            assert jobStatus.getMapProgress() == 1.0f;
            assert jobStatus.getReduceProgress() >= 0.0f && jobStatus.getReduceProgress() < 1.0f;

            // Ensure that reduces progress increases.
            U.sleep(2100);

            recentJobStatus = job.getStatus();

            assert recentJobStatus.getReduceProgress() > jobStatus.getReduceProgress() : "Old="
                    + jobStatus.getReduceProgress() + ", new=" + recentJobStatus.getReduceProgress();

            reduceLockFile.delete();
        }

        job.waitForCompletion(false);

        // Check the same status snapshot that the assertions below inspect.
        jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.SUCCEEDED, 1.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() == 1.0f;
        assert jobStatus.getReduceProgress() == 1.0f;

        dumpIgfs(igfs, new IgfsPath(PATH_OUTPUT));
    } finally {
        // Release the client connection. The cluster is only set once the job has
        // connected (on submit), so guard against a failure that happened earlier.
        if (job.getCluster() != null)
            job.getCluster().close();
    }
}

From source file:org.apache.ignite.internal.processors.hadoop.impl.client.HadoopClientProtocolSelfTest.java

License:Apache License

/**
 * Test job submission.
 *
 * <p>Submits a job and walks it through the setup, map and (optionally) reduce phases.
 * Each phase is released by deleting the corresponding lock file, and reported
 * progress is checked before and after each transition. The job's cluster
 * connection is closed when the test finishes.
 *
 * @param noCombiners Whether there are no combiners.
 * @param noReducers Whether there are no reducers.
 * @throws Exception If failed.
 */
public void checkJobSubmit(boolean noCombiners, boolean noReducers) throws Exception {
    IgniteFileSystem igfs = grid(0).fileSystem(HadoopAbstractSelfTest.igfsName);

    igfs.mkdirs(new IgfsPath(PATH_INPUT));

    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(igfs.create(new IgfsPath(PATH_INPUT + "/test.file"), true)))) {

        bw.write("word");
    }

    Configuration conf = config(HadoopAbstractSelfTest.REST_PORT);

    final Job job = Job.getInstance(conf);

    try {
        job.setJobName(JOB_NAME);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(TestMapper.class);
        job.setReducerClass(TestReducer.class);

        if (!noCombiners)
            job.setCombinerClass(TestCombiner.class);

        if (noReducers)
            job.setNumReduceTasks(0);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TestOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(PATH_INPUT));
        FileOutputFormat.setOutputPath(job, new Path(PATH_OUTPUT));

        job.submit();

        JobID jobId = job.getJobID();

        // Setup phase.
        JobStatus jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() >= 0.0f && jobStatus.getSetupProgress() < 1.0f;
        assert jobStatus.getMapProgress() == 0.0f;
        assert jobStatus.getReduceProgress() == 0.0f;

        U.sleep(2100);

        JobStatus recentJobStatus = job.getStatus();

        assert recentJobStatus.getSetupProgress() > jobStatus.getSetupProgress() : "Old="
                + jobStatus.getSetupProgress() + ", new=" + recentJobStatus.getSetupProgress();

        // Transferring to map phase.
        setupLockFile.delete();

        // Wait outside of 'assert' so the wait still happens when assertions are disabled.
        boolean setupDone = GridTestUtils.waitForCondition(new GridAbsPredicate() {
            @Override
            public boolean apply() {
                try {
                    return F.eq(1.0f, job.getStatus().getSetupProgress());
                } catch (Exception e) {
                    throw new RuntimeException("Unexpected exception.", e);
                }
            }
        }, 5000L);

        assert setupDone : "Setup phase did not complete in time.";

        // Map phase.
        jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() >= 0.0f && jobStatus.getMapProgress() < 1.0f;
        assert jobStatus.getReduceProgress() == 0.0f;

        U.sleep(2100);

        recentJobStatus = job.getStatus();

        assert recentJobStatus.getMapProgress() > jobStatus.getMapProgress() : "Old="
                + jobStatus.getMapProgress() + ", new=" + recentJobStatus.getMapProgress();

        // Transferring to reduce phase.
        mapLockFile.delete();

        boolean mapDone = GridTestUtils.waitForCondition(new GridAbsPredicate() {
            @Override
            public boolean apply() {
                try {
                    return F.eq(1.0f, job.getStatus().getMapProgress());
                } catch (Exception e) {
                    throw new RuntimeException("Unexpected exception.", e);
                }
            }
        }, 5000L);

        assert mapDone : "Map phase did not complete in time.";

        if (!noReducers) {
            // Reduce phase.
            jobStatus = job.getStatus();
            checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
            assert jobStatus.getSetupProgress() == 1.0f;
            assert jobStatus.getMapProgress() == 1.0f;
            assert jobStatus.getReduceProgress() >= 0.0f && jobStatus.getReduceProgress() < 1.0f;

            // Ensure that reduces progress increases.
            U.sleep(2100);

            recentJobStatus = job.getStatus();

            assert recentJobStatus.getReduceProgress() > jobStatus.getReduceProgress() : "Old="
                    + jobStatus.getReduceProgress() + ", new=" + recentJobStatus.getReduceProgress();

            reduceLockFile.delete();
        }

        job.waitForCompletion(false);

        // Check the same status snapshot that the assertions below inspect.
        jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.SUCCEEDED, 1.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() == 1.0f;
        assert jobStatus.getReduceProgress() == 1.0f;

        dumpIgfs(igfs, new IgfsPath(PATH_OUTPUT));
    } finally {
        // The cluster is only set once the job has connected (on submit). Guard the
        // close so an earlier failure is not masked by a NullPointerException here.
        if (job.getCluster() != null)
            job.getCluster().close();
    }
}

From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java

License:Apache License

/**
 * Runs a test with a single input/*w ww  .java  2 s . c  om*/
 * 
 * @param config
 *            Configuration
 * @param input
 *            Input
 * @param expectedTuples
 *            Expected tuples
 * @throws IOException
 * @throws InterruptedException
 */
protected final void testSingleInput(Configuration config, File input, int expectedSplits, int expectedTuples)
        throws IOException, InterruptedException {
    // Set up fake job
    InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    this.addInputPath(input, job.getConfiguration(), job);
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    Assert.assertEquals(1, FileInputFormat.getInputPaths(context).length);
    NLineInputFormat.setNumLinesPerSplit(job, LARGE_SIZE);

    // Check splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(expectedSplits, splits.size());

    // Check tuples
    for (InputSplit split : splits) {
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                new TaskAttemptID());
        RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        this.checkTuples(reader, expectedTuples);
    }
}

From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java

License:Apache License

/**
 * Runs a test over several input files at once.
 *
 * @param inputs
 *            Inputs
 * @param expectedSplits
 *            Number of splits expected
 * @param expectedTuples
 *            Number of tuples expected in total across all splits; also used as
 *            the lines-per-split setting
 * @throws IOException
 * @throws InterruptedException
 */
protected final void testMultipleInputs(File[] inputs, int expectedSplits, int expectedTuples)
        throws IOException, InterruptedException {
    // Obtain the configuration for this test
    final Configuration configuration = this.prepareConfiguration();

    // Build a job that is never submitted and register every input with it
    final InputFormat<LongWritable, T> format = this.getInputFormat();
    final Job fakeJob = Job.getInstance(configuration);
    fakeJob.setInputFormatClass(format.getClass());
    for (final File inputFile : inputs) {
        this.addInputPath(inputFile, fakeJob.getConfiguration(), fakeJob);
    }
    final JobContext jobContext = new JobContextImpl(fakeJob.getConfiguration(), fakeJob.getJobID());
    final int registeredPaths = FileInputFormat.getInputPaths(jobContext).length;
    Assert.assertEquals(inputs.length, registeredPaths);
    NLineInputFormat.setNumLinesPerSplit(fakeJob, expectedTuples);

    // Verify the number of splits produced
    final List<InputSplit> inputSplits = format.getSplits(jobContext);
    Assert.assertEquals(expectedSplits, inputSplits.size());

    // Sum the tuples read from each split and compare against the expected total
    int totalTuples = 0;
    for (final InputSplit inputSplit : inputSplits) {
        final TaskAttemptID attemptId = new TaskAttemptID();
        final TaskAttemptContext attemptContext = new TaskAttemptContextImpl(fakeJob.getConfiguration(),
                attemptId);
        final RecordReader<LongWritable, T> tupleReader = format.createRecordReader(inputSplit, attemptContext);
        tupleReader.initialize(inputSplit, attemptContext);
        totalTuples = totalTuples + this.countTuples(tupleReader);
    }
    Assert.assertEquals(expectedTuples, totalTuples);
}

From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java

License:Apache License

/**
 * Runs a test over several input files, validating each split before reading it.
 *
 * @param config
 *            Configuration used to create the fake job and to validate splits
 * @param inputs
 *            Inputs
 * @param expectedSplits
 *            Number of splits expected
 * @param expectedTuples
 *            Number of tuples expected in total across all splits
 * @throws IOException
 * @throws InterruptedException
 */
protected final void testSplitInputs(Configuration config, File[] inputs, int expectedSplits,
        int expectedTuples) throws IOException, InterruptedException {
    // Build a job that is never submitted and register every input with it
    final InputFormat<LongWritable, T> format = this.getInputFormat();
    final Job fakeJob = Job.getInstance(config);
    fakeJob.setInputFormatClass(format.getClass());
    for (final File inputFile : inputs) {
        this.addInputPath(inputFile, fakeJob.getConfiguration(), fakeJob);
    }
    final JobContext jobContext = new JobContextImpl(fakeJob.getConfiguration(), fakeJob.getJobID());
    final int registeredPaths = FileInputFormat.getInputPaths(jobContext).length;
    Assert.assertEquals(inputs.length, registeredPaths);

    // Verify the number of splits produced
    final List<InputSplit> inputSplits = format.getSplits(jobContext);
    Assert.assertEquals(expectedSplits, inputSplits.size());

    // Validate and read each split, summing the tuples seen
    int totalTuples = 0;
    for (final InputSplit inputSplit : inputSplits) {
        final boolean splitIsValid = this.isValidSplit(inputSplit, config);
        Assert.assertTrue(splitIsValid);

        final TaskAttemptID attemptId = new TaskAttemptID();
        final TaskAttemptContext attemptContext = new TaskAttemptContextImpl(fakeJob.getConfiguration(),
                attemptId);
        final RecordReader<LongWritable, T> tupleReader = format.createRecordReader(inputSplit, attemptContext);
        tupleReader.initialize(inputSplit, attemptContext);
        totalTuples = totalTuples + this.countTuples(tupleReader);
    }
    Assert.assertEquals(expectedTuples, totalTuples);
}

From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java

License:Apache License

/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and checks that we can workaround
 * JENA-820 successfully by setting the
 * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job
 * 
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public final void blank_node_divergence_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence").toFile();

    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);

        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            // Close the reader when done so no handle on the temporary input is left
            // open when the cleanup in the finally block runs
            try (RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split,
                    inputTaskContext)) {
                reader.initialize(split, inputTaskContext);

                // Copy the input to the output - each triple goes to a separate
                // output file
                // This is how we force multiple files to be produced
                int taskID = 1;
                while (reader.nextKeyValue()) {
                    // Prepare the output writing
                    OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                    TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                            createAttemptID(1, ++taskID, 1));
                    RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

                    writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                    writer.close(outputTaskContext);
                }
            }
        }

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));

        // Enabling this flag works around the JENA-820 issue
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            try (RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split,
                    inputTaskContext)) {
                reader.initialize(split, inputTaskContext);

                while (reader.nextKeyValue()) {
                    nodes.add(getSubject(reader.getCurrentValue().get()));
                }
            }
        }
        // Nodes should not have diverged
        Assert.assertEquals(1, nodes.size());

    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}

From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java

License:Apache License

/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and shows that they diverge in the
 * subsequent job when the JENA-820 workaround is not enabled
 * /* w  w  w  .j  a v a  2 s.c om*/
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_divergence_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);

        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - each triple goes to a separate
            // output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                        createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines. However it is the default behaviour
        // because when we start from external inputs we want them to be
        // file scoped.
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));

        // Make sure JENA-820 flag is disabled
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, false);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes should have diverged
        Assert.assertEquals(2, nodes.size());

    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}