List of usage examples for org.apache.hadoop.mapreduce.InputFormat.createRecordReader
/**
 * Prototype of the method whose usages are listed in the examples that follow: given an input split and a
 * task attempt context it returns a {@code RecordReader} over keys of type {@code K} and values of type
 * {@code V}.
 *
 * NOTE(review): every example in this listing calls {@code initialize(split, context)} on the returned
 * reader before reading from it - presumably that is required by the contract; confirm against the Hadoop
 * API documentation.
 *
 * @param split the input split the reader is created for
 * @param context the task attempt context handed to the reader factory
 * @return the record reader for {@code split}
 * @throws IOException declared by the prototype
 * @throws InterruptedException declared by the prototype
 */
public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
From source file:org.apache.ignite.internal.processors.hadoop.impl.v2.HadoopV2MapTask.java
License:Apache License
/**
 * {@inheritDoc}
 * <p>
 * Runs the map phase: publishes the task's mapper index through {@code HadoopMapperUtils} (or clears it when
 * the task has none), creates and initializes a record reader for the native input split, runs the configured
 * mapper and finally commits the output. An output writer is only prepared when the job has neither a
 * combiner nor a reducer; otherwise {@code null} is passed to commit/abort. If an exception is caught the
 * output is aborted.
 *
 * @param taskCtx Task context providing the job context, task info and job info.
 * @throws IgniteCheckedException If the input split is {@code null} or the map phase fails. An interruption
 *      is rethrown as {@code IgniteInterruptedCheckedException} after the interrupt flag has been restored.
 */
@SuppressWarnings({ "ConstantConditions", "unchecked" })
@Override
public void run0(HadoopV2TaskContext taskCtx) throws IgniteCheckedException {
    OutputFormat outputFormat = null;

    // Set as soon as any exception is caught so that the outer finally knows it has to abort.
    boolean failed = false;

    JobContextImpl jobCtx = taskCtx.jobContext();

    if (!taskCtx.taskInfo().hasMapperIndex()) {
        HadoopMapperUtils.clearMapperIndex();
    } else {
        HadoopMapperUtils.mapperIndex(taskCtx.taskInfo().mapperIndex());
    }

    try {
        HadoopV2Context hadoopCtx = hadoopContext();

        InputSplit nativeSplit = hadoopCtx.getInputSplit();

        if (nativeSplit == null) {
            throw new IgniteCheckedException("Input split cannot be null.");
        }

        // Instantiate the job's input format and hand its reader over to the Hadoop context.
        InputFormat inputFmt = ReflectionUtils.newInstance(jobCtx.getInputFormatClass(),
            hadoopCtx.getConfiguration());

        RecordReader splitReader = inputFmt.createRecordReader(nativeSplit, hadoopCtx);

        splitReader.initialize(nativeSplit, hadoopCtx);

        hadoopCtx.reader(splitReader);

        HadoopJobInfo info = taskCtx.job().info();

        // A writer is needed only when the map output is the final output of the job.
        if (!info.hasCombiner() && !info.hasReducer()) {
            outputFormat = prepareWriter(jobCtx);
        } else {
            outputFormat = null;
        }

        Mapper mapperInst = ReflectionUtils.newInstance(jobCtx.getMapperClass(), hadoopCtx.getConfiguration());

        try {
            mapperInst.run(new WrappedMapper().getMapContext(hadoopCtx));

            taskCtx.onMapperFinished();
        } finally {
            closeWriter();
        }

        commit(outputFormat);
    } catch (InterruptedException e) {
        failed = true;

        // Restore the interrupt flag before translating the exception.
        Thread.currentThread().interrupt();

        throw new IgniteInterruptedCheckedException(e);
    } catch (Exception e) {
        failed = true;

        throw new IgniteCheckedException(e);
    } finally {
        HadoopMapperUtils.clearMapperIndex();

        if (failed) {
            abort(outputFormat);
        }
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.v2.GridHadoopV2MapTask.java
License:Apache License
/**
 * {@inheritDoc}
 * <p>
 * Runs the map phase: resolves the native Hadoop split (a {@code GridHadoopFileBlock} is converted into a
 * {@code FileSplit}, anything else is resolved through the task context), creates and initializes a record
 * reader for it, runs the configured mapper and commits the output. An output writer is only prepared when
 * the job has neither a combiner nor a reducer. If an exception is caught the output is aborted.
 *
 * @param taskCtx Task context providing the job context, native splits and job info.
 * @throws IgniteCheckedException If the map phase fails. An interruption is rethrown as
 *      {@code IgniteInterruptedCheckedException} after the interrupt flag has been restored.
 */
@SuppressWarnings({ "ConstantConditions", "unchecked" })
@Override
public void run0(GridHadoopV2TaskContext taskCtx) throws IgniteCheckedException {
    GridHadoopInputSplit split = info().inputSplit();

    final InputSplit nativeSplit;

    if (!(split instanceof GridHadoopFileBlock)) {
        nativeSplit = (InputSplit) taskCtx.getNativeSplit(split);
    } else {
        GridHadoopFileBlock fileBlock = (GridHadoopFileBlock) split;

        nativeSplit = new FileSplit(new Path(fileBlock.file().toString()), fileBlock.start(),
            fileBlock.length(), null);
    }

    assert nativeSplit != null;

    OutputFormat outputFormat = null;

    // Set as soon as any exception is caught so that the outer finally knows it has to abort.
    boolean failed = false;

    JobContextImpl jobCtx = taskCtx.jobContext();

    try {
        // Instantiate the job's input format and hand its reader over to the Hadoop context.
        InputFormat inputFmt = ReflectionUtils.newInstance(jobCtx.getInputFormatClass(),
            hadoopContext().getConfiguration());

        RecordReader splitReader = inputFmt.createRecordReader(nativeSplit, hadoopContext());

        splitReader.initialize(nativeSplit, hadoopContext());

        hadoopContext().reader(splitReader);

        GridHadoopJobInfo info = taskCtx.job().info();

        // A writer is needed only when the map output is the final output of the job.
        if (!info.hasCombiner() && !info.hasReducer()) {
            outputFormat = prepareWriter(jobCtx);
        } else {
            outputFormat = null;
        }

        Mapper mapperInst = ReflectionUtils.newInstance(jobCtx.getMapperClass(),
            hadoopContext().getConfiguration());

        try {
            mapperInst.run(new WrappedMapper().getMapContext(hadoopContext()));
        } finally {
            closeWriter();
        }

        commit(outputFormat);
    } catch (InterruptedException e) {
        failed = true;

        // Restore the interrupt flag before translating the exception.
        Thread.currentThread().interrupt();

        throw new IgniteInterruptedCheckedException(e);
    } catch (Exception e) {
        failed = true;

        throw new IgniteCheckedException(e);
    } finally {
        if (failed) {
            abort(outputFormat);
        }
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.v2.HadoopV2MapTask.java
License:Apache License
/** {@inheritDoc} */ @SuppressWarnings({ "ConstantConditions", "unchecked" }) @Override// w ww.jav a2s. c om public void run0(HadoopV2TaskContext taskCtx) throws IgniteCheckedException { HadoopInputSplit split = info().inputSplit(); InputSplit nativeSplit; if (split instanceof HadoopFileBlock) { HadoopFileBlock block = (HadoopFileBlock) split; nativeSplit = new FileSplit(new Path(block.file().toString()), block.start(), block.length(), null); } else nativeSplit = (InputSplit) taskCtx.getNativeSplit(split); assert nativeSplit != null; OutputFormat outputFormat = null; Exception err = null; JobContextImpl jobCtx = taskCtx.jobContext(); try { InputFormat inFormat = ReflectionUtils.newInstance(jobCtx.getInputFormatClass(), hadoopContext().getConfiguration()); RecordReader reader = inFormat.createRecordReader(nativeSplit, hadoopContext()); reader.initialize(nativeSplit, hadoopContext()); hadoopContext().reader(reader); HadoopJobInfo jobInfo = taskCtx.job().info(); outputFormat = jobInfo.hasCombiner() || jobInfo.hasReducer() ? null : prepareWriter(jobCtx); Mapper mapper = ReflectionUtils.newInstance(jobCtx.getMapperClass(), hadoopContext().getConfiguration()); try { mapper.run(new WrappedMapper().getMapContext(hadoopContext())); } finally { closeWriter(); } commit(outputFormat); } catch (InterruptedException e) { err = e; Thread.currentThread().interrupt(); throw new IgniteInterruptedCheckedException(e); } catch (Exception e) { err = e; throw new IgniteCheckedException(e); } finally { if (err != null) abort(outputFormat); } }
From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License:Apache License
/** * Runs a test with a single input//from ww w . j a va 2 s . c o m * * @param config * Configuration * @param input * Input * @param expectedTuples * Expected tuples * @throws IOException * @throws InterruptedException */ protected final void testSingleInput(Configuration config, File input, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); this.addInputPath(input, job.getConfiguration(), job); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(1, FileInputFormat.getInputPaths(context).length); NLineInputFormat.setNumLinesPerSplit(job, LARGE_SIZE); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples for (InputSplit split : splits) { TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); this.checkTuples(reader, expectedTuples); } }
From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License:Apache License
/** * Runs a multiple input test/*from w ww . j a v a 2s . c o m*/ * * @param inputs * Inputs * @param expectedSplits * Number of splits expected * @param expectedTuples * Number of tuples expected * @throws IOException * @throws InterruptedException */ protected final void testMultipleInputs(File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Prepare configuration and inputs Configuration config = this.prepareConfiguration(); // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); for (File input : inputs) { this.addInputPath(input, job.getConfiguration(), job); } JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length); NLineInputFormat.setNumLinesPerSplit(job, expectedTuples); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples int count = 0; for (InputSplit split : splits) { TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); count += this.countTuples(reader); } Assert.assertEquals(expectedTuples, count); }
From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License:Apache License
protected final void testSplitInputs(Configuration config, File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config);/*w w w .jav a 2 s .c o m*/ job.setInputFormatClass(inputFormat.getClass()); for (File input : inputs) { this.addInputPath(input, job.getConfiguration(), job); } JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples int count = 0; for (InputSplit split : splits) { // Validate split Assert.assertTrue(this.isValidSplit(split, config)); // Read split TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); count += this.countTuples(reader); } Assert.assertEquals(expectedTuples, count); }
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/** * Test that starts with two blank nodes with the same identity in a single * file, splits them over two files and checks that we can workaround * JENA-820 successfully by setting the/*from ww w . j av a 2s . com*/ * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job * * @throws IOException * @throws InterruptedException */ @Test public final void blank_node_divergence_01() throws IOException, InterruptedException { Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile()); Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity()); // Temporary files File a = File.createTempFile("bnode_divergence", getInitialInputExtension()); File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile(); try { // Prepare the input data // Two mentions of the same blank node in the same file List<T> tuples = new ArrayList<>(); Node bnode = NodeFactory.createBlankNode(); Node pred = NodeFactory.createURI("http://example.org/predicate"); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first"))); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second"))); writeTuples(a, tuples); // Set up fake job which will process the file as a single split Configuration config = new Configuration(true); InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath())); FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath())); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); for (InputSplit split : splits) { // Initialize the input reading 
TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1)); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); // Copy the input to the output - each triple goes to a separate // output file // This is how we force multiple files to be produced int taskID = 1; while (reader.nextKeyValue()) { // Prepare the output writing OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat(); TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1)); RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext); writer.write(reader.getCurrentKey(), reader.getCurrentValue()); writer.close(outputTaskContext); } } // Promote outputs from temporary status promoteInputs(intermediateOutputDir); // Now we need to create a subsequent job that reads the // intermediate outputs // As described in JENA-820 at this point the blank nodes are // consistent, however when we read them from different files they // by default get treated as different nodes and so the blank nodes // diverge which is incorrect and undesirable behaviour in // multi-stage pipelines LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath()); job = Job.getInstance(config); inputFormat = createIntermediateInputFormat(); job.setInputFormatClass(inputFormat.getClass()); FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath())); // Enabling this flag works around the JENA-820 issue job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true); context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits splits = inputFormat.getSplits(context); Assert.assertEquals(2, splits.size()); // Expect to end up with a single blank node Set<Node> nodes = new 
HashSet<Node>(); for (InputSplit split : splits) { TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); while (reader.nextKeyValue()) { nodes.add(getSubject(reader.getCurrentValue().get())); } } // Nodes should not have diverged Assert.assertEquals(1, nodes.size()); } finally { a.delete(); deleteDirectory(intermediateOutputDir); } }
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/** * Test that starts with two blank nodes with the same identity in a single * file, splits them over two files and shows that they diverge in the * subsequent job when the JENA-820 workaround is not enabled * //from www . j a v a 2 s.c o m * @throws IOException * @throws InterruptedException */ @Test public void blank_node_divergence_02() throws IOException, InterruptedException { Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile()); Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity()); // Temporary files File a = File.createTempFile("bnode_divergence", getInitialInputExtension()); File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile(); try { // Prepare the input data // Two mentions of the same blank node in the same file List<T> tuples = new ArrayList<>(); Node bnode = NodeFactory.createBlankNode(); Node pred = NodeFactory.createURI("http://example.org/predicate"); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first"))); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second"))); writeTuples(a, tuples); // Set up fake job which will process the file as a single split Configuration config = new Configuration(true); InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath())); FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath())); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); for (InputSplit split : splits) { // Initialize the input reading TaskAttemptContext inputTaskContext = new 
TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1)); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); // Copy the input to the output - each triple goes to a separate // output file // This is how we force multiple files to be produced int taskID = 1; while (reader.nextKeyValue()) { // Prepare the output writing OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat(); TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1)); RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext); writer.write(reader.getCurrentKey(), reader.getCurrentValue()); writer.close(outputTaskContext); } } // Promote outputs from temporary status promoteInputs(intermediateOutputDir); // Now we need to create a subsequent job that reads the // intermediate outputs // As described in JENA-820 at this point the blank nodes are // consistent, however when we read them from different files they // by default get treated as different nodes and so the blank nodes // diverge which is incorrect and undesirable behaviour in // multi-stage pipelines. However it is the default behaviour // because when we start from external inputs we want them to be // file scoped. 
LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath()); job = Job.getInstance(config); inputFormat = createIntermediateInputFormat(); job.setInputFormatClass(inputFormat.getClass()); FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath())); // Make sure JENA-820 flag is disabled job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, false); context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits splits = inputFormat.getSplits(context); Assert.assertEquals(2, splits.size()); // Expect to end up with a single blank node Set<Node> nodes = new HashSet<Node>(); for (InputSplit split : splits) { TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); while (reader.nextKeyValue()) { nodes.add(getSubject(reader.getCurrentValue().get())); } } // Nodes should have diverged Assert.assertEquals(2, nodes.size()); } finally { a.delete(); deleteDirectory(intermediateOutputDir); } }
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/** * Test that starts with two blank nodes in two different files and checks * that writing them to a single file does not conflate them * //w ww . jav a 2 s .co m * @throws IOException * @throws InterruptedException */ @Test public void blank_node_identity_01() throws IOException, InterruptedException { Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile()); Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity()); // Temporary files File a = File.createTempFile("bnode_identity", getInitialInputExtension()); File b = File.createTempFile("bnode_identity", getInitialInputExtension()); File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile(); try { // Prepare the input data // Different blank nodes in different files List<T> tuples = new ArrayList<>(); Node bnode1 = NodeFactory.createBlankNode(); Node bnode2 = NodeFactory.createBlankNode(); Node pred = NodeFactory.createURI("http://example.org/predicate"); tuples.add(createTuple(bnode1, pred, NodeFactory.createLiteral("first"))); writeTuples(a, tuples); tuples.clear(); tuples.add(createTuple(bnode2, pred, NodeFactory.createLiteral("second"))); writeTuples(b, tuples); // Set up fake job which will process the two files Configuration config = new Configuration(true); InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath())); FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath())); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(2, splits.size()); // Prepare the output writing - putting all 
output to a single file OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat(); TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 2, 1)); RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext); for (InputSplit split : splits) { // Initialize the input reading TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1)); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); // Copy the input to the output - all triples go to a single // output while (reader.nextKeyValue()) { writer.write(reader.getCurrentKey(), reader.getCurrentValue()); } } writer.close(outputTaskContext); // Promote outputs from temporary status promoteInputs(intermediateOutputDir); // Now we need to create a subsequent job that reads the // intermediate outputs // The Blank nodes should have been given separate identities so we // should not be conflating them, this is the opposite problem to // that described in JENA-820 LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath()); job = Job.getInstance(config); inputFormat = createIntermediateInputFormat(); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath())); context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); // Expect to end up with a single blank node Set<Node> nodes = new HashSet<Node>(); for (InputSplit split : splits) { TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, TValue> reader = 
inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); while (reader.nextKeyValue()) { nodes.add(getSubject(reader.getCurrentValue().get())); } } // Nodes must not have converged Assert.assertEquals(2, nodes.size()); } finally { a.delete(); b.delete(); deleteDirectory(intermediateOutputDir); } }
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/** * Test that starts with two blank nodes in two different files and checks * that writing them to a single file does not conflate them * /*from w w w.j a va 2 s . c o m*/ * @throws IOException * @throws InterruptedException */ @Test public void blank_node_identity_02() throws IOException, InterruptedException { Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile()); Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity()); // Temporary files File a = File.createTempFile("bnode_identity", getInitialInputExtension()); File b = File.createTempFile("bnode_identity", getInitialInputExtension()); File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile(); try { // Prepare the input data // Same blank node but in different files so must be treated as // different blank nodes and not converge List<T> tuples = new ArrayList<>(); Node bnode = NodeFactory.createBlankNode(); Node pred = NodeFactory.createURI("http://example.org/predicate"); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first"))); writeTuples(a, tuples); tuples.clear(); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second"))); writeTuples(b, tuples); // Set up fake job which will process the two files Configuration config = new Configuration(true); InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath())); FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath())); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(2, splits.size()); // Prepare the output 
writing - putting all output to a single file OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat(); TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 2, 1)); RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext); for (InputSplit split : splits) { // Initialize the input reading TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1)); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); // Copy the input to the output - all triples go to a single // output while (reader.nextKeyValue()) { writer.write(reader.getCurrentKey(), reader.getCurrentValue()); } } writer.close(outputTaskContext); // Promote outputs from temporary status promoteInputs(intermediateOutputDir); // Now we need to create a subsequent job that reads the // intermediate outputs // The Blank nodes should have been given separate identities so we // should not be conflating them, this is the opposite problem to // that described in JENA-820 LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath()); job = Job.getInstance(config); inputFormat = createIntermediateInputFormat(); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath())); context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); // Expect to end up with a single blank node Set<Node> nodes = new HashSet<Node>(); for (InputSplit split : splits) { TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, TValue> reader = 
inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); while (reader.nextKeyValue()) { nodes.add(getSubject(reader.getCurrentValue().get())); } } // Nodes must not diverge Assert.assertEquals(2, nodes.size()); } finally { a.delete(); b.delete(); deleteDirectory(intermediateOutputDir); } }