List of usage examples for org.apache.hadoop.mapreduce.task TaskAttemptContextImpl TaskAttemptContextImpl
public TaskAttemptContextImpl(Configuration conf, TaskAttemptID taskId)
From source file:org.apache.hadoop.examples.terasort.TeraInputFormat.java
License:Apache License
/** * Use the input splits to take samples of the input and generate sample * keys. By default reads 100,000 keys from 10 locations in the input, sorts * them and picks N-1 keys to generate N equally sized partitions. * @param job the job to sample/*from www . j ava 2 s. c o m*/ * @param partFile where to write the output file to * @throws Throwable if something goes wrong */ public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable { long t1 = System.currentTimeMillis(); Configuration conf = job.getConfiguration(); final TeraInputFormat inFormat = new TeraInputFormat(); final TextSampler sampler = new TextSampler(); int partitions = job.getNumReduceTasks(); long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(), TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE); final List<InputSplit> splits = inFormat.getSplits(job); long t2 = System.currentTimeMillis(); System.out.println("Computing input splits took " + (t2 - t1) + "ms"); int samples = Math.min( conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS), splits.size()); System.out.println("Sampling " + samples + " splits of " + splits.size()); final long recordsPerSample = sampleSize / samples; final int sampleStep = splits.size() / samples; Thread[] samplerReader = new Thread[samples]; SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group"); // take N samples from different parts of the input for (int i = 0; i < samples; ++i) { final int idx = i; samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) { { setDaemon(true); } public void run() { long records = 0; try { TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx), context); reader.initialize(splits.get(sampleStep * idx), context); while (reader.nextKeyValue()) { sampler.addKey(new Text(reader.getCurrentKey())); records += 1; if (recordsPerSample <= records) { break; } } } catch (IOException ie) { System.err.println( "Got an exception while reading splits " + StringUtils.stringifyException(ie)); throw new RuntimeException(ie); } catch (InterruptedException e) { } } }; samplerReader[i].start(); } FileSystem outFs = partFile.getFileSystem(conf); DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10, outFs.getDefaultBlockSize(partFile)); for (int i = 0; i < samples; i++) { try { samplerReader[i].join(); if (threadGroup.getThrowable() != null) { throw threadGroup.getThrowable(); } } catch (InterruptedException e) { } } for (Text split : sampler.createPartitions(partitions)) { split.write(writer); } writer.close(); long t3 = System.currentTimeMillis(); System.out.println("Computing parititions took " + (t3 - t2) + "ms"); }
From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License:Apache License
/** * Runs a test with a single input//ww w. ja v a 2s . c om * * @param config * Configuration * @param input * Input * @param expectedTuples * Expected tuples * @throws IOException * @throws InterruptedException */ protected final void testSingleInput(Configuration config, File input, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); this.addInputPath(input, job.getConfiguration(), job); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(1, FileInputFormat.getInputPaths(context).length); NLineInputFormat.setNumLinesPerSplit(job, LARGE_SIZE); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples for (InputSplit split : splits) { TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); this.checkTuples(reader, expectedTuples); } }
From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License:Apache License
/** * Runs a multiple input test//from ww w.j av a 2 s . c o m * * @param inputs * Inputs * @param expectedSplits * Number of splits expected * @param expectedTuples * Number of tuples expected * @throws IOException * @throws InterruptedException */ protected final void testMultipleInputs(File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Prepare configuration and inputs Configuration config = this.prepareConfiguration(); // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); for (File input : inputs) { this.addInputPath(input, job.getConfiguration(), job); } JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length); NLineInputFormat.setNumLinesPerSplit(job, expectedTuples); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples int count = 0; for (InputSplit split : splits) { TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); count += this.countTuples(reader); } Assert.assertEquals(expectedTuples, count); }
From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java
License:Apache License
protected final void testSplitInputs(Configuration config, File[] inputs, int expectedSplits, int expectedTuples) throws IOException, InterruptedException { // Set up fake job InputFormat<LongWritable, T> inputFormat = this.getInputFormat(); Job job = Job.getInstance(config);//from w ww . j a va 2s . c o m job.setInputFormatClass(inputFormat.getClass()); for (File input : inputs) { this.addInputPath(input, job.getConfiguration(), job); } JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertEquals(inputs.length, FileInputFormat.getInputPaths(context).length); // Check splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(expectedSplits, splits.size()); // Check tuples int count = 0; for (InputSplit split : splits) { // Validate split Assert.assertTrue(this.isValidSplit(split, config)); // Read split TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, T> reader = inputFormat.createRecordReader(split, taskContext); reader.initialize(split, taskContext); count += this.countTuples(reader); } Assert.assertEquals(expectedTuples, count); }
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/** * Test that starts with two blank nodes with the same identity in a single * file, splits them over two files and checks that we can workaround * JENA-820 successfully by setting the/*from w w w . ja v a 2 s.c o m*/ * {@link RdfIOConstants#GLOBAL_BNODE_IDENTITY} flag for our subsequent job * * @throws IOException * @throws InterruptedException */ @Test public final void blank_node_divergence_01() throws IOException, InterruptedException { Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile()); Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity()); // Temporary files File a = File.createTempFile("bnode_divergence", getInitialInputExtension()); File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile(); try { // Prepare the input data // Two mentions of the same blank node in the same file List<T> tuples = new ArrayList<>(); Node bnode = NodeFactory.createBlankNode(); Node pred = NodeFactory.createURI("http://example.org/predicate"); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first"))); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second"))); writeTuples(a, tuples); // Set up fake job which will process the file as a single split Configuration config = new Configuration(true); InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath())); FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath())); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); for (InputSplit split : splits) { // Initialize the input reading TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1)); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); // Copy the input to the output - each triple goes to a separate // output file // This is how we force multiple files to be produced int taskID = 1; while (reader.nextKeyValue()) { // Prepare the output writing OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat(); TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1)); RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext); writer.write(reader.getCurrentKey(), reader.getCurrentValue()); writer.close(outputTaskContext); } } // Promote outputs from temporary status promoteInputs(intermediateOutputDir); // Now we need to create a subsequent job that reads the // intermediate outputs // As described in JENA-820 at this point the blank nodes are // consistent, however when we read them from different files they // by default get treated as different nodes and so the blank nodes // diverge which is incorrect and undesirable behaviour in // multi-stage pipelines LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath()); job = Job.getInstance(config); inputFormat = createIntermediateInputFormat(); job.setInputFormatClass(inputFormat.getClass()); FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath())); // Enabling this flag works around the JENA-820 issue job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, true); context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits splits = inputFormat.getSplits(context); Assert.assertEquals(2, splits.size()); // Expect to end up with a single blank node Set<Node> nodes = new HashSet<Node>(); for (InputSplit split : splits) { TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); while (reader.nextKeyValue()) { nodes.add(getSubject(reader.getCurrentValue().get())); } } // Nodes should not have diverged Assert.assertEquals(1, nodes.size()); } finally { a.delete(); deleteDirectory(intermediateOutputDir); } }
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/** * Test that starts with two blank nodes with the same identity in a single * file, splits them over two files and shows that they diverge in the * subsequent job when the JENA-820 workaround is not enabled * /*w w w. j av a 2s. c o m*/ * @throws IOException * @throws InterruptedException */ @Test public void blank_node_divergence_02() throws IOException, InterruptedException { Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile()); Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity()); // Temporary files File a = File.createTempFile("bnode_divergence", getInitialInputExtension()); File intermediateOutputDir = Files.createTempDirectory("bnode_divergence", new FileAttribute[0]).toFile(); try { // Prepare the input data // Two mentions of the same blank node in the same file List<T> tuples = new ArrayList<>(); Node bnode = NodeFactory.createBlankNode(); Node pred = NodeFactory.createURI("http://example.org/predicate"); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first"))); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second"))); writeTuples(a, tuples); // Set up fake job which will process the file as a single split Configuration config = new Configuration(true); InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath())); FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath())); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); for (InputSplit split : splits) { // Initialize the input reading TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1)); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); // Copy the input to the output - each triple goes to a separate // output file // This is how we force multiple files to be produced int taskID = 1; while (reader.nextKeyValue()) { // Prepare the output writing OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat(); TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, ++taskID, 1)); RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext); writer.write(reader.getCurrentKey(), reader.getCurrentValue()); writer.close(outputTaskContext); } } // Promote outputs from temporary status promoteInputs(intermediateOutputDir); // Now we need to create a subsequent job that reads the // intermediate outputs // As described in JENA-820 at this point the blank nodes are // consistent, however when we read them from different files they // by default get treated as different nodes and so the blank nodes // diverge which is incorrect and undesirable behaviour in // multi-stage pipelines. However it is the default behaviour // because when we start from external inputs we want them to be // file scoped. LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath()); job = Job.getInstance(config); inputFormat = createIntermediateInputFormat(); job.setInputFormatClass(inputFormat.getClass()); FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath())); // Make sure JENA-820 flag is disabled job.getConfiguration().setBoolean(RdfIOConstants.GLOBAL_BNODE_IDENTITY, false); context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits splits = inputFormat.getSplits(context); Assert.assertEquals(2, splits.size()); // Expect to end up with a single blank node Set<Node> nodes = new HashSet<Node>(); for (InputSplit split : splits) { TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); while (reader.nextKeyValue()) { nodes.add(getSubject(reader.getCurrentValue().get())); } } // Nodes should have diverged Assert.assertEquals(2, nodes.size()); } finally { a.delete(); deleteDirectory(intermediateOutputDir); } }
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/** * Test that starts with two blank nodes in two different files and checks * that writing them to a single file does not conflate them * //from w ww . j av a2 s . c om * @throws IOException * @throws InterruptedException */ @Test public void blank_node_identity_01() throws IOException, InterruptedException { Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile()); Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity()); // Temporary files File a = File.createTempFile("bnode_identity", getInitialInputExtension()); File b = File.createTempFile("bnode_identity", getInitialInputExtension()); File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile(); try { // Prepare the input data // Different blank nodes in different files List<T> tuples = new ArrayList<>(); Node bnode1 = NodeFactory.createBlankNode(); Node bnode2 = NodeFactory.createBlankNode(); Node pred = NodeFactory.createURI("http://example.org/predicate"); tuples.add(createTuple(bnode1, pred, NodeFactory.createLiteral("first"))); writeTuples(a, tuples); tuples.clear(); tuples.add(createTuple(bnode2, pred, NodeFactory.createLiteral("second"))); writeTuples(b, tuples); // Set up fake job which will process the two files Configuration config = new Configuration(true); InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath())); FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath())); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(2, splits.size()); // Prepare the output writing - putting all output to a single file OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat(); TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 2, 1)); RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext); for (InputSplit split : splits) { // Initialize the input reading TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1)); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); // Copy the input to the output - all triples go to a single // output while (reader.nextKeyValue()) { writer.write(reader.getCurrentKey(), reader.getCurrentValue()); } } writer.close(outputTaskContext); // Promote outputs from temporary status promoteInputs(intermediateOutputDir); // Now we need to create a subsequent job that reads the // intermediate outputs // The Blank nodes should have been given separate identities so we // should not be conflating them, this is the opposite problem to // that described in JENA-820 LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath()); job = Job.getInstance(config); inputFormat = createIntermediateInputFormat(); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath())); context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); // Expect to end up with a single blank node Set<Node> nodes = new HashSet<Node>(); for (InputSplit split : splits) { TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); while (reader.nextKeyValue()) { nodes.add(getSubject(reader.getCurrentValue().get())); } } // Nodes must not have converged Assert.assertEquals(2, nodes.size()); } finally { a.delete(); b.delete(); deleteDirectory(intermediateOutputDir); } }
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/** * Test that starts with two blank nodes in two different files and checks * that writing them to a single file does not conflate them * //from w w w .ja va 2s. c om * @throws IOException * @throws InterruptedException */ @Test public void blank_node_identity_02() throws IOException, InterruptedException { Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile()); Assume.assumeFalse("Requires that Blank Node identity not be preserved", this.preservesBlankNodeIdentity()); // Temporary files File a = File.createTempFile("bnode_identity", getInitialInputExtension()); File b = File.createTempFile("bnode_identity", getInitialInputExtension()); File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile(); try { // Prepare the input data // Same blank node but in different files so must be treated as // different blank nodes and not converge List<T> tuples = new ArrayList<>(); Node bnode = NodeFactory.createBlankNode(); Node pred = NodeFactory.createURI("http://example.org/predicate"); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first"))); writeTuples(a, tuples); tuples.clear(); tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second"))); writeTuples(b, tuples); // Set up fake job which will process the two files Configuration config = new Configuration(true); InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat(); Job job = Job.getInstance(config); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath())); FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath())); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(2, splits.size()); // Prepare the output writing - putting all output to a single file OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat(); TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 2, 1)); RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext); for (InputSplit split : splits) { // Initialize the input reading TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), createAttemptID(1, 1, 1)); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); // Copy the input to the output - all triples go to a single // output while (reader.nextKeyValue()) { writer.write(reader.getCurrentKey(), reader.getCurrentValue()); } } writer.close(outputTaskContext); // Promote outputs from temporary status promoteInputs(intermediateOutputDir); // Now we need to create a subsequent job that reads the // intermediate outputs // The Blank nodes should have been given separate identities so we // should not be conflating them, this is the opposite problem to // that described in JENA-820 LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath()); job = Job.getInstance(config); inputFormat = createIntermediateInputFormat(); job.setInputFormatClass(inputFormat.getClass()); NLineInputFormat.setNumLinesPerSplit(job, 100); FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath())); context = new JobContextImpl(job.getConfiguration(), job.getJobID()); // Get the splits splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); // Expect to end up with a single blank node Set<Node> nodes = new HashSet<Node>(); for (InputSplit split : splits) { TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext); reader.initialize(split, inputTaskContext); while (reader.nextKeyValue()) { nodes.add(getSubject(reader.getCurrentValue().get())); } } // Nodes must not diverge Assert.assertEquals(2, nodes.size()); } finally { a.delete(); b.delete(); deleteDirectory(intermediateOutputDir); } }
From source file:org.apache.jena.hadoop.rdf.io.output.AbstractNodeTupleOutputFormatTests.java
License:Apache License
/** * Tests output//w w w . ja v a2 s. c om * * @param f * File to output to * @param num * Number of tuples to output * @throws IOException * @throws InterruptedException */ protected final void testOutput(File f, int num) throws IOException, InterruptedException { // Prepare configuration Configuration config = this.prepareConfiguration(); // Set up fake job OutputFormat<NullWritable, T> outputFormat = this.getOutputFormat(); Job job = Job.getInstance(config); job.setOutputFormatClass(outputFormat.getClass()); this.addOutputPath(f, job.getConfiguration(), job); JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID()); Assert.assertNotNull(FileOutputFormat.getOutputPath(context)); // Output the data TaskAttemptID id = new TaskAttemptID("outputTest", 1, TaskType.MAP, 1, 1); TaskAttemptContext taskContext = new TaskAttemptContextImpl(job.getConfiguration(), id); RecordWriter<NullWritable, T> writer = outputFormat.getRecordWriter(taskContext); Iterator<T> tuples = this.generateTuples(num); while (tuples.hasNext()) { writer.write(NullWritable.get(), tuples.next()); } writer.close(taskContext); // Check output File outputFile = this.findOutputFile(this.folder.getRoot(), context); Assert.assertNotNull(outputFile); this.checkTuples(outputFile, num); }
From source file:org.apache.mnemonic.mapreduce.MneMapreduceBufferDataTest.java
License:Apache License
@BeforeClass public void setUp() throws IOException { m_workdir = new Path(System.getProperty("test.tmp.dir", DEFAULT_WORK_DIR)); m_conf = new JobConf(); m_rand = Utils.createRandom();//from w ww. ja v a2s .co m m_partfns = new ArrayList<String>(); try { m_fs = FileSystem.getLocal(m_conf).getRaw(); m_fs.delete(m_workdir, true); m_fs.mkdirs(m_workdir); } catch (IOException e) { throw new IllegalStateException("bad fs init", e); } m_taid = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0); m_tacontext = new TaskAttemptContextImpl(m_conf, m_taid); MneConfigHelper.setDir(m_conf, MneConfigHelper.DEFAULT_OUTPUT_CONFIG_PREFIX, m_workdir.toString()); MneConfigHelper.setBaseOutputName(m_conf, null, "buffer-data"); MneConfigHelper.setMemServiceName(m_conf, MneConfigHelper.DEFAULT_INPUT_CONFIG_PREFIX, SERVICE_NAME); MneConfigHelper.setSlotKeyId(m_conf, MneConfigHelper.DEFAULT_INPUT_CONFIG_PREFIX, SLOT_KEY_ID); MneConfigHelper.setDurableTypes(m_conf, MneConfigHelper.DEFAULT_INPUT_CONFIG_PREFIX, new DurableType[] { DurableType.BUFFER }); MneConfigHelper.setEntityFactoryProxies(m_conf, MneConfigHelper.DEFAULT_INPUT_CONFIG_PREFIX, new Class<?>[] {}); MneConfigHelper.setMemServiceName(m_conf, MneConfigHelper.DEFAULT_OUTPUT_CONFIG_PREFIX, SERVICE_NAME); MneConfigHelper.setSlotKeyId(m_conf, MneConfigHelper.DEFAULT_OUTPUT_CONFIG_PREFIX, SLOT_KEY_ID); MneConfigHelper.setMemPoolSize(m_conf, MneConfigHelper.DEFAULT_OUTPUT_CONFIG_PREFIX, 1024L * 1024 * 1024 * 4); MneConfigHelper.setDurableTypes(m_conf, MneConfigHelper.DEFAULT_OUTPUT_CONFIG_PREFIX, new DurableType[] { DurableType.BUFFER }); MneConfigHelper.setEntityFactoryProxies(m_conf, MneConfigHelper.DEFAULT_OUTPUT_CONFIG_PREFIX, new Class<?>[] {}); }