List of usage examples for org.apache.hadoop.mapreduce RecordReader getCurrentKey
public abstract KEYIN getCurrentKey() throws IOException, InterruptedException;
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/**
 * Test that starts with two blank nodes in two different files and checks
 * that writing them to a single file does not conflate them
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_identity_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved",
            this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_identity", getInitialInputExtension());
    File b = File.createTempFile("bnode_identity", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Same blank node but in different files so must be treated as
        // different blank nodes and not converge
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        writeTuples(a, tuples);

        tuples.clear();
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(b, tuples);

        // Set up fake job which will process the two files
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Prepare the output writing - putting all output to a single file
        OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
        TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                createAttemptID(1, 2, 1));
        RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - all triples go to a single
            // output
            while (reader.nextKeyValue()) {
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
            }
        }
        writer.close(outputTaskContext);

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // The Blank nodes should have been given separate identities so we
        // should not be conflating them, this is the opposite problem to
        // that described in JENA-820
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        // Expect to end up with TWO distinct blank nodes, one per original input
        // file (the original comment here claimed "a single blank node", which
        // contradicts the assertion below)
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes must not converge - the two identities must have stayed distinct
        Assert.assertEquals(2, nodes.size());
    } finally {
        a.delete();
        b.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
From source file:org.apache.jena.tdbloader4.partitioners.SplitSampler.java
License:Apache License
/** * From each split sampled, take the first numSamples / numSplits records. *///from www . ja v a 2s. co m @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException { List<InputSplit> splits = inf.getSplits(job); ArrayList<K> samples = new ArrayList<K>(numSamples); int splitsToSample = Math.min(maxSplitsSampled, splits.size()); int samplesPerSplit = numSamples / splitsToSample; log.debug("Sampling {} splits, taking {} samples per split", splitsToSample, samplesPerSplit); long records = 0; for (int i = 0; i < splitsToSample; ++i) { TaskAttemptContext samplingContext = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()); InputSplit split = splits.get(i); log.debug("Sampling {} split", split); RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext); reader.initialize(split, samplingContext); while (reader.nextKeyValue()) { LongQuadWritable currentKey = (LongQuadWritable) reader.getCurrentKey(); // TODO: why do we need to do that? Why on earth we have -1 in subject, predicate or object position??? if ((currentKey.get(0) > 0) && (currentKey.get(1) > 0) && (currentKey.get(2) > 0)) { LongQuadWritable key = new LongQuadWritable(currentKey.get(0), currentKey.get(1), currentKey.get(2), currentKey.get(3)); log.debug("Sampled {}", key); samples.add((K) key); ++records; if (records >= (i + 1) * samplesPerSplit) { log.debug("Records is {} and (i + 1) * samplesPerSplit is {}", records, (i + 1) * samplesPerSplit); break; } } } reader.close(); } return (K[]) samples.toArray(); }
From source file:org.apache.mahout.classifier.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
@Override protected boolean runJob(Job job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); // retrieve the splits TextInputFormat input = new TextInputFormat(); List<InputSplit> splits = input.getSplits(job); int nbSplits = splits.size(); log.debug("Nb splits : {}", nbSplits); InputSplit[] sorted = new InputSplit[nbSplits]; splits.toArray(sorted);//w ww . j a v a 2 s .c o m Builder.sortSplits(sorted); int numTrees = Builder.getNbTrees(conf); // total number of trees TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID()); firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees); /* first instance id in hadoop's order */ //int[] firstIds = new int[nbSplits]; /* partitions' sizes in hadoop order */ int[] sizes = new int[nbSplits]; // to compute firstIds, process the splits in file order long slowest = 0; // duration of slowest map int firstId = 0; for (InputSplit split : splits) { int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task); reader.initialize(split, task); Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees); long time = System.currentTimeMillis(); //firstIds[hp] = firstId; while (reader.nextKeyValue()) { mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput); firstId++; sizes[hp]++; } mapper.cleanup(firstOutput); time = System.currentTimeMillis() - time; log.info("Duration : {}", DFUtils.elapsedTime(time)); if (time > slowest) { slowest = time; } } log.info("Longest duration : {}", DFUtils.elapsedTime(slowest)); return true; }
From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
@Override protected boolean runJob(Job job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); // retrieve the splits TextInputFormat input = new TextInputFormat(); List<InputSplit> splits = input.getSplits(job); int nbSplits = splits.size(); log.debug("Nb splits : {}", nbSplits); InputSplit[] sorted = new InputSplit[nbSplits]; splits.toArray(sorted);//from w w w .j ava2 s . c o m Builder.sortSplits(sorted); int numTrees = Builder.getNbTrees(conf); // total number of trees TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID()); firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees); firstIds = new int[nbSplits]; sizes = new int[nbSplits]; // to compute firstIds, process the splits in file order long slowest = 0; // duration of slowest map int firstId = 0; for (int p = 0; p < nbSplits; p++) { InputSplit split = splits.get(p); int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task); reader.initialize(split, task); Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees); long time = System.currentTimeMillis(); firstIds[hp] = firstId; while (reader.nextKeyValue()) { mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput); firstId++; sizes[hp]++; } mapper.cleanup(firstOutput); time = System.currentTimeMillis() - time; log.info("Duration : {}", DFUtils.elapsedTime(time)); if (time > slowest) { slowest = time; } } log.info("Longest duration : {}", DFUtils.elapsedTime(slowest)); return true; }
From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
/** * The second step uses the trees to predict the rest of the instances outside * their own partition/*from w ww. j a v a 2 s . c o m*/ */ protected void secondStep(Configuration conf, Path forestPath, PredictionCallback callback) throws IOException, InterruptedException { JobContext jobContext = new JobContext(conf, new JobID()); // retrieve the splits TextInputFormat input = new TextInputFormat(); List<InputSplit> splits = input.getSplits(jobContext); int nbSplits = splits.size(); log.debug("Nb splits : {}", nbSplits); InputSplit[] sorted = new InputSplit[nbSplits]; splits.toArray(sorted); Builder.sortSplits(sorted); int numTrees = Builder.getNbTrees(conf); // total number of trees // compute the expected number of outputs int total = 0; for (int p = 0; p < nbSplits; p++) { total += Step2Mapper.nbConcerned(nbSplits, numTrees, p); } TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID()); secondOutput = new MockContext(new Step2Mapper(), conf, task.getTaskAttemptID(), numTrees); long slowest = 0; // duration of slowest map for (int partition = 0; partition < nbSplits; partition++) { InputSplit split = sorted[partition]; RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task); // load the output of the 1st step int nbConcerned = Step2Mapper.nbConcerned(nbSplits, numTrees, partition); TreeID[] fsKeys = new TreeID[nbConcerned]; Node[] fsTrees = new Node[nbConcerned]; FileSystem fs = forestPath.getFileSystem(conf); int numInstances = InterResults.load(fs, forestPath, nbSplits, numTrees, partition, fsKeys, fsTrees); Step2Mapper mapper = new Step2Mapper(); mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances); long time = System.currentTimeMillis(); while (reader.nextKeyValue()) { mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), secondOutput); } mapper.cleanup(secondOutput); time = System.currentTimeMillis() - time; log.info("Duration : {}", DFUtils.elapsedTime(time)); if (time > slowest) { slowest 
= time; } } log.info("Longest duration : {}", DFUtils.elapsedTime(slowest)); }
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
/**
 * Checks that Step0Mapper records, for each sorted split, the partition index,
 * the first key seen in the split and the number of records in the split.
 */
public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);
    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    Step0Context context = new Step0Context(new Step0Mapper(), job.getConfiguration(), new TaskAttemptID(),
            NUM_MAPS);

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            // NOTE: the reader reuses this writable; key.get() is taken before
            // the next advance, so capturing the primitive here is safe
            LongWritable key = reader.getCurrentKey();

            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, reader.getCurrentValue(), context);

            size++;
        }

        mapper.cleanup(context);

        // validate the mapper's output
        assertEquals(p, context.keys[p]);
        // NOTE(review): firstKey.longValue() NPEs if a split is empty — assumes
        // every generated split holds at least one record; TODO confirm
        assertEquals(firstKey.longValue(), context.values[p].getFirstId());
        assertEquals(size, context.values[p].getSize());
    }
}
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
/**
 * Checks Step0Job.processOutput(): collects per-split (first key, size) pairs
 * by reading the splits directly, then verifies that the partitions returned
 * by processOutput() carry the expected first instance ids.
 */
public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < NUM_INSTANCES; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);
    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    List<Integer> keys = new ArrayList<Integer>();
    List<Step0Output> values = new ArrayList<Step0Output>();

    int[] expectedIds = new int[NUM_MAPS];

    TaskAttemptContext context = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        // NOTE(review): the reader is never closed in this loop — leaks the
        // underlying stream if many splits are read; TODO close in a finally
        reader.initialize(split, context);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();
            Text value = reader.getCurrentValue();

            if (firstKey == null) {
                firstKey = key.get();
                // the label of the first instance doubles as its expected id
                expectedIds[p] = converter.convert(0, value.toString()).getLabel();
            }

            size++;
        }

        keys.add(p);
        // NOTE(review): unboxing firstKey NPEs for an empty split — assumes
        // every split holds at least one record; TODO confirm
        values.add(new Step0Output(firstKey, size));
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}
From source file:org.apache.rya.accumulo.mr.GraphXEdgeInputFormatTest.java
License:Apache License
@SuppressWarnings("rawtypes") @Test// w w w .jav a 2 s . c o m public void testInputFormat() throws Exception { RyaStatement input = RyaStatement.builder().setSubject(new RyaURI("http://www.google.com")) .setPredicate(new RyaURI("http://some_other_uri")).setObject(new RyaURI("http://www.yahoo.com")) .setColumnVisibility(new byte[0]).setValue(new byte[0]).build(); apiImpl.add(input); Job jobConf = Job.getInstance(); GraphXEdgeInputFormat.setMockInstance(jobConf, instance.getInstanceName()); GraphXEdgeInputFormat.setConnectorInfo(jobConf, username, password); GraphXEdgeInputFormat.setTableLayout(jobConf, TABLE_LAYOUT.SPO); GraphXEdgeInputFormat.setInputTableName(jobConf, table); GraphXEdgeInputFormat.setInputTableName(jobConf, table); GraphXEdgeInputFormat.setScanIsolation(jobConf, false); GraphXEdgeInputFormat.setLocalIterators(jobConf, false); GraphXEdgeInputFormat.setOfflineTableScan(jobConf, false); GraphXEdgeInputFormat inputFormat = new GraphXEdgeInputFormat(); JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID()); List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(), new TaskAttemptID(new TaskID(), 1)); RecordReader reader = inputFormat.createRecordReader(splits.get(0), taskAttemptContext); RecordReader ryaStatementRecordReader = (RecordReader) reader; ryaStatementRecordReader.initialize(splits.get(0), taskAttemptContext); List<Edge> results = new ArrayList<Edge>(); while (ryaStatementRecordReader.nextKeyValue()) { Edge writable = (Edge) ryaStatementRecordReader.getCurrentValue(); long srcId = writable.srcId(); long destId = writable.dstId(); RyaTypeWritable rtw = null; Object text = ryaStatementRecordReader.getCurrentKey(); Edge<RyaTypeWritable> edge = new Edge<RyaTypeWritable>(srcId, destId, rtw); results.add(edge); System.out.println(text); } System.out.println(results.size()); 
System.out.println(results); Assert.assertTrue(results.size() == 2); }
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.io.RecordReaderWriterTest.java
License:Apache License
private static void validateFileSplits(final List<FileSplit> fileSplits, final Configuration configuration, final Class<? extends InputFormat<NullWritable, VertexWritable>> inputFormatClass, final Optional<Class<? extends OutputFormat<NullWritable, VertexWritable>>> outFormatClass) throws Exception { final InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass, configuration); final TaskAttemptContext job = new TaskAttemptContextImpl(configuration, new TaskAttemptID(UUID.randomUUID().toString(), 0, TaskType.MAP, 0, 0)); int vertexCount = 0; int outEdgeCount = 0; int inEdgeCount = 0; final OutputFormat<NullWritable, VertexWritable> outputFormat = outFormatClass.isPresent() ? ReflectionUtils.newInstance(outFormatClass.get(), configuration) : null;//w ww. j av a 2 s . co m final RecordWriter<NullWritable, VertexWritable> writer = null == outputFormat ? null : outputFormat.getRecordWriter(job); boolean foundKeyValue = false; for (final FileSplit split : fileSplits) { logger.info("\treading file split {}", split.getPath().getName() + " ({}", split.getStart() + "..." 
+ (split.getStart() + split.getLength()), "{} {} bytes)"); final RecordReader reader = inputFormat.createRecordReader(split, job); float lastProgress = -1f; while (reader.nextKeyValue()) { //System.out.println("" + reader.getProgress() + "> " + reader.getCurrentKey() + ": " + reader.getCurrentValue()); final float progress = reader.getProgress(); assertTrue(progress >= lastProgress); assertEquals(NullWritable.class, reader.getCurrentKey().getClass()); final VertexWritable vertexWritable = (VertexWritable) reader.getCurrentValue(); if (null != writer) writer.write(NullWritable.get(), vertexWritable); vertexCount++; outEdgeCount = outEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.OUT)); inEdgeCount = inEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.IN)); // final Vertex vertex = vertexWritable.get(); assertEquals(Integer.class, vertex.id().getClass()); if (vertex.value("name").equals("SUGAR MAGNOLIA")) { foundKeyValue = true; assertEquals(92, IteratorUtils.count(vertex.edges(Direction.OUT))); assertEquals(77, IteratorUtils.count(vertex.edges(Direction.IN))); } lastProgress = progress; } } assertEquals(8049, outEdgeCount); assertEquals(8049, inEdgeCount); assertEquals(outEdgeCount, inEdgeCount); assertEquals(808, vertexCount); assertTrue(foundKeyValue); if (null != writer) { writer.close(new TaskAttemptContextImpl(configuration, job.getTaskAttemptID())); for (int i = 1; i < 10; i++) { final File outputDirectory = new File( new URL(configuration.get("mapreduce.output.fileoutputformat.outputdir")).toURI()); final List<FileSplit> splits = generateFileSplits( new File(outputDirectory.getAbsoluteFile() + "/_temporary/0/_temporary/" + job.getTaskAttemptID().getTaskID().toString().replace("task", "attempt") + "_0" + "/part-m-00000"), i); validateFileSplits(splits, configuration, inputFormatClass, Optional.empty()); } } }
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java
License:Apache License
static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits, RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException { int splitDataIndex = getIndexOfSplit(splits, split); Assert.assertTrue(splitDataIndex != -1); List<TestRecord> records = splits.get(splitDataIndex).e1; int itemIndex = 0; // iterate and validate stuff ... while (reader.nextKeyValue()) { Text key = reader.getCurrentKey(); BytesWritable value = reader.getCurrentValue(); TestRecord testRecord = records.get(itemIndex++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);/* ww w.ja v a 2s . com*/ // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes()); indexofHeaderTerminator += 4; Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0); } reader.close(); Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); splits.remove(splitDataIndex); }