List of usage examples for org.apache.hadoop.mapreduce.Job.getJobID()
public JobID getJobID()
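Before the per-project examples, a minimal self-contained sketch of the call pattern they all share. The class name, job name, and the elided job setup are illustrative only; getJobID() is generally expected to return null until submit() has assigned an ID.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;

public class GetJobIdSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "get-job-id-sketch");

        // Not yet submitted: no ID has been assigned, so this is expected to be null.
        System.out.println("Before submit: " + job.getJobID());

        // ... mapper/reducer, input/output formats and paths would be configured here ...

        job.submit();

        // After submit() the framework has assigned a cluster-unique ID.
        JobID jobId = job.getJobID();
        System.out.println("After submit: " + jobId);
    }
}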
From source file: org.apache.hcatalog.templeton.tool.TempletonControllerJob.java
License: Apache License
/**
 * Enqueue the job and print out the job id for later collection.
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    conf.set(JAR_ARGS_NAME, TempletonUtils.encodeArray(args));
    conf.set("user.name", UserGroupInformation.getCurrentUser().getShortUserName());

    Job job = new Job(conf);
    job.setJarByClass(TempletonControllerJob.class);
    job.setJobName("TempletonControllerJob");
    job.setMapperClass(LaunchMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormatClass(SingleInputFormat.class);

    NullOutputFormat<NullWritable, NullWritable> of = new NullOutputFormat<NullWritable, NullWritable>();
    job.setOutputFormatClass(of.getClass());
    job.setNumReduceTasks(0);

    JobClient jc = new JobClient(new JobConf(job.getConfiguration()));
    Token<DelegationTokenIdentifier> mrdt = jc.getDelegationToken(new Text("mr token"));
    job.getCredentials().addToken(new Text("mr token"), mrdt);

    job.submit();

    submittedJobId = job.getJobID();

    return 0;
}
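The controller job is submitted asynchronously with submit(), and job.getJobID() is captured immediately into the submittedJobId field; that field is how Templeton hands the id back to the caller for later collection, per the method's Javadoc.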
From source file: org.apache.hive.hcatalog.templeton.tool.TempletonControllerJob.java
License: Apache License
/**
 * Enqueue the job and print out the job id for later collection.
 * @see org.apache.hive.hcatalog.templeton.CompleteDelegator
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException, TException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("Preparing to submit job: " + Arrays.toString(args));
    }
    Configuration conf = getConf();
    conf.set(JAR_ARGS_NAME, TempletonUtils.encodeArray(args));
    String memoryMb = appConf.mapperMemoryMb();
    if (memoryMb != null && memoryMb.length() != 0) {
        conf.set(AppConfig.HADOOP_MAP_MEMORY_MB, memoryMb);
    }
    String amMemoryMB = appConf.amMemoryMb();
    if (amMemoryMB != null && !amMemoryMB.isEmpty()) {
        conf.set(AppConfig.HADOOP_MR_AM_MEMORY_MB, amMemoryMB);
    }
    String amJavaOpts = appConf.controllerAMChildOpts();
    if (amJavaOpts != null && !amJavaOpts.isEmpty()) {
        conf.set(AppConfig.HADOOP_MR_AM_JAVA_OPTS, amJavaOpts);
    }

    String user = UserGroupInformation.getCurrentUser().getShortUserName();
    conf.set("user.name", user);

    Job job = new Job(conf);
    job.setJarByClass(LaunchMapper.class);
    job.setJobName(TempletonControllerJob.class.getSimpleName());
    job.setMapperClass(LaunchMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormatClass(SingleInputFormat.class);

    NullOutputFormat<NullWritable, NullWritable> of = new NullOutputFormat<NullWritable, NullWritable>();
    job.setOutputFormatClass(of.getClass());
    job.setNumReduceTasks(0);

    JobClient jc = new JobClient(new JobConf(job.getConfiguration()));

    if (UserGroupInformation.isSecurityEnabled()) {
        Token<DelegationTokenIdentifier> mrdt = jc.getDelegationToken(new Text("mr token"));
        job.getCredentials().addToken(new Text("mr token"), mrdt);
    }
    String metastoreTokenStrForm = addHMSToken(job, user);

    job.submit();
    submittedJobId = job.getJobID();

    if (metastoreTokenStrForm != null) {
        // so that it can be cancelled later from CompleteDelegator
        DelegationTokenCache.getStringFormTokenCache().storeDelegationToken(submittedJobId.toString(),
                metastoreTokenStrForm);
        LOG.debug("Added metastore delegation token for jobId=" + submittedJobId.toString() + " user=" + user);
    }
    return 0;
}
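This variant stores the id's string form (submittedJobId.toString()) as a cache key so the metastore token can be cancelled later from CompleteDelegator. A natural companion, sketched below under the assumption of Hadoop 2.x and a reachable cluster, is looking a submitted job back up by its id via org.apache.hadoop.mapreduce.Cluster; the id string is a hypothetical placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Cluster;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;

public class LookupByJobId {
    public static void main(String[] args) throws Exception {
        // Hypothetical id string, e.g. one previously captured from job.getJobID().
        JobID jobId = JobID.forName("job_1400000000000_0001");

        Cluster cluster = new Cluster(new Configuration());
        try {
            // getJob() returns null if the cluster no longer knows about this job.
            Job job = cluster.getJob(jobId);
            if (job != null)
                System.out.println("State: " + job.getStatus().getState());
        } finally {
            cluster.close();
        }
    }
}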
From source file: org.apache.ignite.client.hadoop.GridHadoopClientProtocolSelfTest.java
License: Apache License
/**
 * Test job submission.
 *
 * @param noCombiners Whether there are no combiners.
 * @param noReducers Whether there are no reducers.
 * @throws Exception If failed.
 */
public void checkJobSubmit(boolean noCombiners, boolean noReducers) throws Exception {
    IgniteFs igfs = grid(0).fileSystem(GridHadoopAbstractSelfTest.igfsName);

    igfs.mkdirs(new IgfsPath(PATH_INPUT));

    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(igfs.create(new IgfsPath(PATH_INPUT + "/test.file"), true)))) {
        bw.write("word");
    }

    Configuration conf = config(GridHadoopAbstractSelfTest.REST_PORT);

    final Job job = Job.getInstance(conf);

    job.setJobName(JOB_NAME);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(TestMapper.class);
    job.setReducerClass(TestReducer.class);

    if (!noCombiners)
        job.setCombinerClass(TestCombiner.class);

    if (noReducers)
        job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TestOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(PATH_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(PATH_OUTPUT));

    job.submit();

    JobID jobId = job.getJobID();

    // Setup phase.
    JobStatus jobStatus = job.getStatus();
    checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
    assert jobStatus.getSetupProgress() >= 0.0f && jobStatus.getSetupProgress() < 1.0f;
    assert jobStatus.getMapProgress() == 0.0f;
    assert jobStatus.getReduceProgress() == 0.0f;

    U.sleep(2100);

    JobStatus recentJobStatus = job.getStatus();

    assert recentJobStatus.getSetupProgress() > jobStatus.getSetupProgress() : "Old="
            + jobStatus.getSetupProgress() + ", new=" + recentJobStatus.getSetupProgress();

    // Transferring to map phase.
    setupLockFile.delete();

    assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
        @Override public boolean apply() {
            try {
                return F.eq(1.0f, job.getStatus().getSetupProgress());
            } catch (Exception e) {
                throw new RuntimeException("Unexpected exception.", e);
            }
        }
    }, 5000L);

    // Map phase.
    jobStatus = job.getStatus();
    checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
    assert jobStatus.getSetupProgress() == 1.0f;
    assert jobStatus.getMapProgress() >= 0.0f && jobStatus.getMapProgress() < 1.0f;
    assert jobStatus.getReduceProgress() == 0.0f;

    U.sleep(2100);

    recentJobStatus = job.getStatus();

    assert recentJobStatus.getMapProgress() > jobStatus.getMapProgress() : "Old=" + jobStatus.getMapProgress()
            + ", new=" + recentJobStatus.getMapProgress();

    // Transferring to reduce phase.
    mapLockFile.delete();

    assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
        @Override public boolean apply() {
            try {
                return F.eq(1.0f, job.getStatus().getMapProgress());
            } catch (Exception e) {
                throw new RuntimeException("Unexpected exception.", e);
            }
        }
    }, 5000L);

    if (!noReducers) {
        // Reduce phase.
        jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() == 1.0f;
        assert jobStatus.getReduceProgress() >= 0.0f && jobStatus.getReduceProgress() < 1.0f;

        // Ensure that reduce progress increases.
        U.sleep(2100);

        recentJobStatus = job.getStatus();

        assert recentJobStatus.getReduceProgress() > jobStatus.getReduceProgress() : "Old="
                + jobStatus.getReduceProgress() + ", new=" + recentJobStatus.getReduceProgress();

        reduceLockFile.delete();
    }

    job.waitForCompletion(false);

    jobStatus = job.getStatus();
    checkJobStatus(job.getStatus(), jobId, JOB_NAME, JobStatus.State.SUCCEEDED, 1.0f);
    assert jobStatus.getSetupProgress() == 1.0f;
    assert jobStatus.getMapProgress() == 1.0f;
    assert jobStatus.getReduceProgress() == 1.0f;

    dumpIgfs(igfs, new IgfsPath(PATH_OUTPUT));
}
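The id is captured right after submit() and passed to checkJobStatus() at every phase, so each JobStatus snapshot fetched via job.getStatus() is verified against the same JobID while setup, map, and reduce progress advance.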
From source file: org.apache.ignite.client.hadoop.HadoopClientProtocolSelfTest.java
License: Apache License
/**
 * Test job submission.
 *
 * @param noCombiners Whether there are no combiners.
 * @param noReducers Whether there are no reducers.
 * @throws Exception If failed.
 */
public void checkJobSubmit(boolean noCombiners, boolean noReducers) throws Exception {
    IgniteFileSystem igfs = grid(0).fileSystem(HadoopAbstractSelfTest.igfsName);

    igfs.mkdirs(new IgfsPath(PATH_INPUT));

    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(igfs.create(new IgfsPath(PATH_INPUT + "/test.file"), true)))) {
        bw.write("word");
    }

    Configuration conf = config(HadoopAbstractSelfTest.REST_PORT);

    final Job job = Job.getInstance(conf);

    job.setJobName(JOB_NAME);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(TestMapper.class);
    job.setReducerClass(TestReducer.class);

    if (!noCombiners)
        job.setCombinerClass(TestCombiner.class);

    if (noReducers)
        job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TestOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(PATH_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(PATH_OUTPUT));

    job.submit();

    JobID jobId = job.getJobID();

    // Setup phase.
    JobStatus jobStatus = job.getStatus();
    checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
    assert jobStatus.getSetupProgress() >= 0.0f && jobStatus.getSetupProgress() < 1.0f;
    assert jobStatus.getMapProgress() == 0.0f;
    assert jobStatus.getReduceProgress() == 0.0f;

    U.sleep(2100);

    JobStatus recentJobStatus = job.getStatus();

    assert recentJobStatus.getSetupProgress() > jobStatus.getSetupProgress() : "Old="
            + jobStatus.getSetupProgress() + ", new=" + recentJobStatus.getSetupProgress();

    // Transferring to map phase.
    setupLockFile.delete();

    assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
        @Override public boolean apply() {
            try {
                return F.eq(1.0f, job.getStatus().getSetupProgress());
            } catch (Exception e) {
                throw new RuntimeException("Unexpected exception.", e);
            }
        }
    }, 5000L);

    // Map phase.
    jobStatus = job.getStatus();
    checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
    assert jobStatus.getSetupProgress() == 1.0f;
    assert jobStatus.getMapProgress() >= 0.0f && jobStatus.getMapProgress() < 1.0f;
    assert jobStatus.getReduceProgress() == 0.0f;

    U.sleep(2100);

    recentJobStatus = job.getStatus();

    assert recentJobStatus.getMapProgress() > jobStatus.getMapProgress() : "Old=" + jobStatus.getMapProgress()
            + ", new=" + recentJobStatus.getMapProgress();

    // Transferring to reduce phase.
    mapLockFile.delete();

    assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
        @Override public boolean apply() {
            try {
                return F.eq(1.0f, job.getStatus().getMapProgress());
            } catch (Exception e) {
                throw new RuntimeException("Unexpected exception.", e);
            }
        }
    }, 5000L);

    if (!noReducers) {
        // Reduce phase.
        jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() == 1.0f;
        assert jobStatus.getReduceProgress() >= 0.0f && jobStatus.getReduceProgress() < 1.0f;

        // Ensure that reduce progress increases.
        U.sleep(2100);

        recentJobStatus = job.getStatus();

        assert recentJobStatus.getReduceProgress() > jobStatus.getReduceProgress() : "Old="
                + jobStatus.getReduceProgress() + ", new=" + recentJobStatus.getReduceProgress();

        reduceLockFile.delete();
    }

    job.waitForCompletion(false);

    jobStatus = job.getStatus();
    checkJobStatus(job.getStatus(), jobId, JOB_NAME, JobStatus.State.SUCCEEDED, 1.0f);
    assert jobStatus.getSetupProgress() == 1.0f;
    assert jobStatus.getMapProgress() == 1.0f;
    assert jobStatus.getReduceProgress() == 1.0f;

    dumpIgfs(igfs, new IgfsPath(PATH_OUTPUT));
}
From source file: org.apache.ignite.internal.processors.hadoop.impl.client.HadoopClientProtocolSelfTest.java
License: Apache License
/**
 * Test job submission.
 *
 * @param noCombiners Whether there are no combiners.
 * @param noReducers Whether there are no reducers.
 * @throws Exception If failed.
 */
public void checkJobSubmit(boolean noCombiners, boolean noReducers) throws Exception {
    IgniteFileSystem igfs = grid(0).fileSystem(HadoopAbstractSelfTest.igfsName);

    igfs.mkdirs(new IgfsPath(PATH_INPUT));

    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(igfs.create(new IgfsPath(PATH_INPUT + "/test.file"), true)))) {
        bw.write("word");
    }

    Configuration conf = config(HadoopAbstractSelfTest.REST_PORT);

    final Job job = Job.getInstance(conf);

    try {
        job.setJobName(JOB_NAME);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(TestMapper.class);
        job.setReducerClass(TestReducer.class);

        if (!noCombiners)
            job.setCombinerClass(TestCombiner.class);

        if (noReducers)
            job.setNumReduceTasks(0);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TestOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(PATH_INPUT));
        FileOutputFormat.setOutputPath(job, new Path(PATH_OUTPUT));

        job.submit();

        JobID jobId = job.getJobID();

        // Setup phase.
        JobStatus jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() >= 0.0f && jobStatus.getSetupProgress() < 1.0f;
        assert jobStatus.getMapProgress() == 0.0f;
        assert jobStatus.getReduceProgress() == 0.0f;

        U.sleep(2100);

        JobStatus recentJobStatus = job.getStatus();

        assert recentJobStatus.getSetupProgress() > jobStatus.getSetupProgress() : "Old="
                + jobStatus.getSetupProgress() + ", new=" + recentJobStatus.getSetupProgress();

        // Transferring to map phase.
        setupLockFile.delete();

        assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
            @Override public boolean apply() {
                try {
                    return F.eq(1.0f, job.getStatus().getSetupProgress());
                } catch (Exception e) {
                    throw new RuntimeException("Unexpected exception.", e);
                }
            }
        }, 5000L);

        // Map phase.
        jobStatus = job.getStatus();
        checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() >= 0.0f && jobStatus.getMapProgress() < 1.0f;
        assert jobStatus.getReduceProgress() == 0.0f;

        U.sleep(2100);

        recentJobStatus = job.getStatus();

        assert recentJobStatus.getMapProgress() > jobStatus.getMapProgress() : "Old="
                + jobStatus.getMapProgress() + ", new=" + recentJobStatus.getMapProgress();

        // Transferring to reduce phase.
        mapLockFile.delete();

        assert GridTestUtils.waitForCondition(new GridAbsPredicate() {
            @Override public boolean apply() {
                try {
                    return F.eq(1.0f, job.getStatus().getMapProgress());
                } catch (Exception e) {
                    throw new RuntimeException("Unexpected exception.", e);
                }
            }
        }, 5000L);

        if (!noReducers) {
            // Reduce phase.
            jobStatus = job.getStatus();
            checkJobStatus(jobStatus, jobId, JOB_NAME, JobStatus.State.RUNNING, 0.0f);
            assert jobStatus.getSetupProgress() == 1.0f;
            assert jobStatus.getMapProgress() == 1.0f;
            assert jobStatus.getReduceProgress() >= 0.0f && jobStatus.getReduceProgress() < 1.0f;

            // Ensure that reduce progress increases.
            U.sleep(2100);

            recentJobStatus = job.getStatus();

            assert recentJobStatus.getReduceProgress() > jobStatus.getReduceProgress() : "Old="
                    + jobStatus.getReduceProgress() + ", new=" + recentJobStatus.getReduceProgress();

            reduceLockFile.delete();
        }

        job.waitForCompletion(false);

        jobStatus = job.getStatus();
        checkJobStatus(job.getStatus(), jobId, JOB_NAME, JobStatus.State.SUCCEEDED, 1.0f);
        assert jobStatus.getSetupProgress() == 1.0f;
        assert jobStatus.getMapProgress() == 1.0f;
        assert jobStatus.getReduceProgress() == 1.0f;

        dumpIgfs(igfs, new IgfsPath(PATH_OUTPUT));
    } finally {
        job.getCluster().close();
    }
}
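This is the same phase-by-phase status pattern as the two previous variants; this later revision additionally wraps the body in try/finally so the underlying Cluster is closed via job.getCluster().close() even when an assertion fails.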
From source file: org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License: Apache License
/**
 * Runs a test with a single input
 *
 * @param config Configuration
 * @param input Input
 * @param expectedSplits Expected splits
 * @param expectedTuples Expected tuples
 * @throws IOException
 * @throws InterruptedException
 */
protected final void testSingleInput(Configuration config, File input, int expectedSplits, int expectedTuples)
        throws IOException, InterruptedException {
    // Set up fake job
    InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    this.addInputPath(input, job.getConfiguration(), job);
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    Assert.assertEquals(1, FileInputFormat.getInputPaths(context).length);
    NLineInputFormat.setNumLinesPerSplit(job, LARGE_SIZE);

    // Check splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(expectedSplits, splits.size());

    // Check tuples
    for (InputSplit split : splits) {
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        this.checkTuples(reader, expectedTuples);
    }
}
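Nothing is submitted here; job.getJobID() (presumably still null at this point, since the job never reaches submit()) simply supplies the JobID argument that the JobContextImpl constructor requires, producing a fake job context for exercising the input format.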
From source file: org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License: Apache License
/**
 * Runs a multiple input test
 *
 * @param inputs Inputs
 * @param expectedSplits Number of splits expected
 * @param expectedTuples Number of tuples expected
 * @throws IOException
 * @throws InterruptedException
 */
protected final void testMultipleInputs(File[] inputs, int expectedSplits, int expectedTuples)
        throws IOException, InterruptedException {
    // Prepare configuration and inputs
    Configuration config = this.prepareConfiguration();

    // Set up fake job
    InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    for (File input : inputs) {
        this.addInputPath(input, job.getConfiguration(), job);
    }
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length);
    NLineInputFormat.setNumLinesPerSplit(job, expectedTuples);

    // Check splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(expectedSplits, splits.size());

    // Check tuples
    int count = 0;
    for (InputSplit split : splits) {
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        count += this.countTuples(reader);
    }
    Assert.assertEquals(expectedTuples, count);
}
From source file: org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License: Apache License
protected final void testSplitInputs(Configuration config, File[] inputs, int expectedSplits,
        int expectedTuples) throws IOException, InterruptedException {
    // Set up fake job
    InputFormat<LongWritable, T> inputFormat = this.getInputFormat();
    Job job = Job.getInstance(config);
    job.setInputFormatClass(inputFormat.getClass());
    for (File input : inputs) {
        this.addInputPath(input, job.getConfiguration(), job);
    }
    JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());
    Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length);

    // Check splits
    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(expectedSplits, splits.size());

    // Check tuples
    int count = 0;
    for (InputSplit split : splits) {
        // Validate split
        Assert.assertTrue(this.isValidSplit(split, config));

        // Read split
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        count += this.countTuples(reader);
    }
    Assert.assertEquals(expectedTuples, count);
}
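This follows the same fake-job pattern as the two helpers above, with per-split validation added; the JobContext built from job.getConfiguration() and job.getJobID() is what both getInputPaths() and getSplits() consume.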
From source file: org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License: Apache License
/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and checks that we can work around
 * JENA-820 successfully by setting the
 * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public final void blank_node_divergence_01() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved",
            this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);

        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - each triple goes to a separate output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                        createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));

        // Enabling this flag works around the JENA-820 issue
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Expect to end up with a single blank node
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }

        // Nodes should not have diverged
        Assert.assertEquals(1, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
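Both the initial job and the follow-up job derive their JobContext from job.getJobID(); with RdfIOConstants.GLOBAL_BNODE_IDENTITY set to true on the second job, blank node identity is treated as global across the two intermediate files, so the test sees exactly one node.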
From source file: org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License: Apache License
/**
 * Test that starts with two blank nodes with the same identity in a single
 * file, splits them over two files and shows that they diverge in the
 * subsequent job when the JENA-820 workaround is not enabled
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_divergence_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved",
            this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_divergence", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile();

    try {
        // Prepare the input data
        // Two mentions of the same blank node in the same file
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(a, tuples);

        // Set up fake job which will process the file as a single split
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - each triple goes to a separate output file
            // This is how we force multiple files to be produced
            int taskID = 1;
            while (reader.nextKeyValue()) {
                // Prepare the output writing
                OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
                TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                        createAttemptID(1, ++taskID, 1));
                RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
                writer.close(outputTaskContext);
            }
        }

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the intermediate outputs
        // As described in JENA-820 at this point the blank nodes are
        // consistent, however when we read them from different files they
        // by default get treated as different nodes and so the blank nodes
        // diverge which is incorrect and undesirable behaviour in
        // multi-stage pipelines. However it is the default behaviour
        // because when we start from external inputs we want them to be
        // file scoped.
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));

        // Make sure the JENA-820 workaround flag is disabled
        job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, false);
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Expect to end up with two distinct blank nodes
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }

        // Nodes should have diverged
        Assert.assertEquals(2, nodes.size());
    } finally {
        a.delete();
        deleteDirectory(intermediateOutputDir);
    }
}