List of usage examples for org.apache.hadoop.io LongWritable get
public long get()
Returns the value of this LongWritable.
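Before the full project examples below, a minimal self-contained sketch of the call itself (the class and variable names here are illustrative, not taken from any of the projects listed):

import org.apache.hadoop.io.LongWritable;

public class LongWritableGetExample {
    public static void main(String[] args) {
        // Wrap a primitive long in Hadoop's Writable box type
        LongWritable writable = new LongWritable(42L);

        // get() unwraps the current value as a primitive long
        long value = writable.get();
        System.out.println(value); // 42

        // set() replaces the value in place; a later get() sees the update
        writable.set(7L);
        System.out.println(writable.get()); // 7
    }
}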
From source file:org.apache.tez.mapreduce.processor.MapUtils.java
License:Apache License
private static InputSplit createInputSplit(FileSystem fs, Path workDir, JobConf job, Path file)
        throws IOException {
    FileInputFormat.setInputPaths(job, workDir);
    LOG.info("Generating data at path: " + file);
    // create a file with length entries
    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, file, LongWritable.class, Text.class);
    try {
        Random r = new Random(System.currentTimeMillis());
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int i = 10; i > 0; i--) {
            key.set(r.nextInt(1000));
            value.set(Integer.toString(i));
            writer.append(key, value);
            LOG.info("<k, v> : <" + key.get() + ", " + value + ">");
        }
    } finally {
        writer.close();
    }

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(job, 1);
    System.err.println("#split = " + splits.length + " ; " + "#locs = " + splits[0].getLocations().length
            + "; " + "loc = " + splits[0].getLocations()[0] + "; " + "off = " + splits[0].getLength() + "; "
            + "file = " + ((FileSplit) splits[0]).getPath());
    return splits[0];
}
From source file:org.apache.tez.mapreduce.processor.reduce.TestReduceProcessor.java
License:Apache License
@Test(timeout = 5000)
public void testReduceProcessor() throws Exception {
    final String dagName = "mrdag0";
    String mapVertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
    String reduceVertexName = MultiStageMRConfigUtil.getFinalReduceVertexName();
    JobConf jobConf = new JobConf(defaultConf);
    setUpJobConf(jobConf);
    MRHelpers.translateMRConfToTez(jobConf);
    jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            new Path(workDir, "localized-resources").toUri().toString());
    jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);

    Path mapInput = new Path(workDir, "map0");
    MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput);

    InputSpec mapInputSpec = new InputSpec("NullSrcVertex",
            InputDescriptor.create(MRInputLegacy.class.getName())
                    .setUserPayload(UserPayload.create(ByteBuffer.wrap(MRRuntimeProtos.MRInputUserPayloadProto
                            .newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf))
                            .build().toByteArray()))),
            1);
    OutputSpec mapOutputSpec = new OutputSpec("NullDestVertex",
            OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName())
                    .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);

    // Run a map
    TestUmbilical testUmbilical = new TestUmbilical();
    LogicalIOProcessorRuntimeTask mapTask = MapUtils.createLogicalTask(localFs, workDir, jobConf, 0, mapInput,
            testUmbilical, dagName, mapVertexName, Collections.singletonList(mapInputSpec),
            Collections.singletonList(mapOutputSpec));
    mapTask.initialize();
    mapTask.run();
    mapTask.close();

    // One VME, one DME
    Assert.assertEquals(2, testUmbilical.getEvents().size());
    Assert.assertEquals(EventType.VERTEX_MANAGER_EVENT, testUmbilical.getEvents().get(0).getEventType());
    Assert.assertEquals(EventType.COMPOSITE_DATA_MOVEMENT_EVENT,
            testUmbilical.getEvents().get(1).getEventType());

    CompositeDataMovementEvent cdmEvent = (CompositeDataMovementEvent) testUmbilical.getEvents().get(1)
            .getEvent();
    Assert.assertEquals(1, cdmEvent.getCount());
    DataMovementEvent dme = cdmEvent.getEvents().iterator().next();
    dme.setTargetIndex(0);

    LOG.info("Starting reduce...");
    JobTokenIdentifier identifier = new JobTokenIdentifier(new Text(dagName));
    JobTokenSecretManager jobTokenSecretManager = new JobTokenSecretManager();
    Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>(identifier, jobTokenSecretManager);
    shuffleToken.setService(identifier.getJobId());

    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            new Path(workDir, "localized-resources").toUri().toString());
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_OPTIMIZE_LOCAL_FETCH, true);
    FileOutputFormat.setOutputPath(jobConf, new Path(workDir, "output"));
    ProcessorDescriptor reduceProcessorDesc = ProcessorDescriptor.create(ReduceProcessor.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf));

    InputSpec reduceInputSpec = new InputSpec(mapVertexName,
            InputDescriptor.create(OrderedGroupedInputLegacy.class.getName())
                    .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);
    OutputSpec reduceOutputSpec = new OutputSpec("NullDestinationVertex",
            OutputDescriptor.create(MROutputLegacy.class.getName())
                    .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);

    // Now run a reduce
    TaskSpec taskSpec = new TaskSpec(TezTestUtils.getMockTaskAttemptId(0, 1, 0, 0), dagName, reduceVertexName,
            -1, reduceProcessorDesc, Collections.singletonList(reduceInputSpec),
            Collections.singletonList(reduceOutputSpec), null);

    Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>();
    serviceConsumerMetadata.put(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID,
            ShuffleUtils.convertJobTokenToBytes(shuffleToken));
    Map<String, String> serviceProviderEnvMap = new HashMap<String, String>();
    ByteBuffer shufflePortBb = ByteBuffer.allocate(4).putInt(0, 8000);
    AuxiliaryServiceHelper.setServiceDataIntoEnv(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID, shufflePortBb,
            serviceProviderEnvMap);

    LogicalIOProcessorRuntimeTask task = new LogicalIOProcessorRuntimeTask(taskSpec, 0, jobConf,
            new String[] { workDir.toString() }, new TestUmbilical(), serviceConsumerMetadata,
            serviceProviderEnvMap, HashMultimap.<String, String>create(), null, "",
            new ExecutionContextImpl("localhost"), Runtime.getRuntime().maxMemory());

    List<Event> destEvents = new LinkedList<Event>();
    destEvents.add(dme);
    task.initialize();
    OrderedGroupedInputLegacy sortedOut = (OrderedGroupedInputLegacy) task.getInputs().values().iterator()
            .next();
    sortedOut.handleEvents(destEvents);
    task.run();
    task.close();

    // MRTask mrTask = (MRTask)t.getProcessor();
    // TODO NEWTEZ Verify the partitioner has not been created
    // Likely not applicable anymore.
    // Assert.assertNull(mrTask.getPartitioner());

    // Only a task commit happens, hence the data is still in the temporary directory.
    Path reduceOutputDir = new Path(new Path(workDir, "output"),
            "_temporary/0/" + IDConverter.toMRTaskIdForOutput(TezTestUtils.getMockTaskId(0, 1, 0)));
    Path reduceOutputFile = new Path(reduceOutputDir, "part-v001-o000-00000");

    SequenceFile.Reader reader = new SequenceFile.Reader(localFs, reduceOutputFile, jobConf);
    LongWritable key = new LongWritable();
    Text value = new Text();
    long prev = Long.MIN_VALUE;
    // Keys must come out of the reduce in strictly ascending order
    while (reader.next(key, value)) {
        if (prev != Long.MIN_VALUE) {
            Assert.assertTrue(prev < key.get());
        }
        prev = key.get();
    }
    reader.close();
}
From source file:org.apache.tez.runtime.library.common.sort.impl.TestTezMerger.java
License:Apache License
private void merge(List<Path> pathList, int mergeFactor, RawComparator rc) throws Exception {
    // Merge datasets
    TezMerger merger = new TezMerger();
    TezRawKeyValueIterator records = merger.merge(defaultConf, localFs, IntWritable.class, LongWritable.class,
            null, false, 0, 1024, pathList.toArray(new Path[pathList.size()]), true, mergeFactor,
            new Path(workDir, "tmp_" + System.nanoTime()), ((rc == null) ? comparator : rc), new Reporter(),
            null, null, null, new Progress());

    // Verify that the merged data is correct
    Map<Integer, Integer> dataMap = Maps.newHashMap();
    int pk = -1;
    while (records.next()) {
        DataInputBuffer key = records.getKey();
        DataInputBuffer value = records.getValue();
        IntWritable k = new IntWritable();
        k.readFields(key);
        LongWritable v = new LongWritable();
        v.readFields(value);
        if (records.isSameKey()) {
            LOG.info("\tSame Key : key=" + k.get() + ", val=" + v.get());
            // More than one value should be present for this key in the source data
            assertTrue(verificationDataSet.get(k.get()).size() > 1);
            // Ensure this is the same as the previous key we saw
            assertTrue("previousKey=" + pk + ", current=" + k.get(), pk == k.get());
        } else {
            LOG.info("key=" + k.get() + ", val=" + v.get());
        }
        pk = k.get();
        int keyCount = (dataMap.containsKey(k.get())) ? (dataMap.get(k.get()) + 1) : 1;
        dataMap.put(k.get(), keyCount);
    }

    // Verify that the number of distinct keys is the same in the source data and the merged output
    assertTrue("dataMap=" + dataMap.keySet().size() + ", verificationSet=" + verificationDataSet.keySet().size(),
            dataMap.keySet().size() == verificationDataSet.keySet().size());

    // Verify against the source data
    for (Integer key : verificationDataSet.keySet()) {
        assertTrue("Data size for " + key + " not matching with source; dataSize:"
                + dataMap.get(key).intValue() + ", source:" + verificationDataSet.get(key).size(),
                dataMap.get(key).intValue() == verificationDataSet.get(key).size());
    }

    // Verify that every key is repeated the same number of times as in the source dataset
    for (Map.Entry<Integer, Integer> entry : dataMap.entrySet()) {
        assertTrue(entry.getKey() + "", verificationDataSet.get(entry.getKey()).size() == entry.getValue());
    }
    LOG.info("******************");
    verificationDataSet.clear();
}
From source file:org.apache.tez.runtime.library.common.sort.impl.TestTezMerger.java
License:Apache License
/**
 * Generate a data set for an IFile. Creates repeated keys if needed.
 *
 * @param keyCount    approximate number of keys to be created
 * @param repeatCount number of times a key should be repeated
 * @return a TreeMultimap of the generated key/value pairs
 */
static TreeMultimap<Integer, Long> createDataForIFile(int keyCount, int repeatCount) {
    TreeMultimap<Integer, Long> dataSet = TreeMultimap.create();
    Random rnd = new Random();
    for (int i = 0; i < keyCount; i++) {
        if (repeatCount > 0 && (rnd.nextInt(keyCount) % 2 == 0)) {
            // repeat this key
            for (int j = 0; j < repeatCount; j++) {
                IntWritable key = new IntWritable(rnd.nextInt(keyCount));
                LongWritable value = new LongWritable(System.nanoTime());
                dataSet.put(key.get(), value.get());
            }
            i += repeatCount;
            LOG.info("Repeated key count=" + repeatCount);
        } else {
            IntWritable key = new IntWritable(rnd.nextInt(keyCount));
            LongWritable value = new LongWritable(System.nanoTime());
            dataSet.put(key.get(), value.get());
        }
    }
    for (Integer key : dataSet.keySet()) {
        for (Long value : dataSet.get(key)) {
            LOG.info("Key=" + key + ", val=" + value);
        }
    }
    LOG.info("=============");
    return dataSet;
}
From source file:org.apache.tez.runtime.library.common.writers.TestUnorderedPartitionedKVWriter.java
License:Apache License
private void baseTest(int numRecords, int numPartitions, Set<Integer> skippedPartitions,
        boolean shouldCompress) throws IOException, InterruptedException {
    PartitionerForTest partitioner = new PartitionerForTest();
    ApplicationId appId = ApplicationId.newInstance(10000, 1);
    TezCounters counters = new TezCounters();
    String uniqueId = UUID.randomUUID().toString();
    OutputContext outputContext = createMockOutputContext(counters, appId, uniqueId);

    Configuration conf = createConfiguration(outputContext, IntWritable.class, LongWritable.class,
            shouldCompress, -1);
    CompressionCodec codec = null;
    if (shouldCompress) {
        codec = new DefaultCodec();
        ((Configurable) codec).setConf(conf);
    }

    int numOutputs = numPartitions;
    long availableMemory = 2048;
    int numRecordsWritten = 0;

    Map<Integer, Multimap<Integer, Long>> expectedValues = new HashMap<Integer, Multimap<Integer, Long>>();
    for (int i = 0; i < numOutputs; i++) {
        expectedValues.put(i, LinkedListMultimap.<Integer, Long>create());
    }

    UnorderedPartitionedKVWriter kvWriter = new UnorderedPartitionedKVWriterForTest(outputContext, conf,
            numOutputs, availableMemory);

    int sizePerBuffer = kvWriter.sizePerBuffer;
    int sizePerRecord = 4 + 8; // IntW + LongW
    int sizePerRecordWithOverhead = sizePerRecord + 12; // Record + META_OVERHEAD

    IntWritable intWritable = new IntWritable();
    LongWritable longWritable = new LongWritable();
    for (int i = 0; i < numRecords; i++) {
        intWritable.set(i);
        longWritable.set(i);
        int partition = partitioner.getPartition(intWritable, longWritable, numOutputs);
        if (skippedPartitions != null && skippedPartitions.contains(partition)) {
            continue;
        }
        expectedValues.get(partition).put(intWritable.get(), longWritable.get());
        kvWriter.write(intWritable, longWritable);
        numRecordsWritten++;
    }
    List<Event> events = kvWriter.close();

    int recordsPerBuffer = sizePerBuffer / sizePerRecordWithOverhead;
    int numExpectedSpills = numRecordsWritten / recordsPerBuffer;

    verify(outputContext, never()).fatalError(any(Throwable.class), any(String.class));

    // Verify the status of the buffers
    if (numExpectedSpills == 0) {
        assertEquals(1, kvWriter.numInitializedBuffers);
    } else {
        assertTrue(kvWriter.numInitializedBuffers > 1);
    }
    assertNull(kvWriter.currentBuffer);
    assertEquals(0, kvWriter.availableBuffers.size());

    // Verify the counters
    TezCounter outputRecordBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES);
    TezCounter outputRecordsCounter = counters.findCounter(TaskCounter.OUTPUT_RECORDS);
    TezCounter outputBytesWithOverheadCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_WITH_OVERHEAD);
    TezCounter fileOutputBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_PHYSICAL);
    TezCounter spilledRecordsCounter = counters.findCounter(TaskCounter.SPILLED_RECORDS);
    TezCounter additionalSpillBytesWritternCounter = counters
            .findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_WRITTEN);
    TezCounter additionalSpillBytesReadCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_READ);
    TezCounter numAdditionalSpillsCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILL_COUNT);
    assertEquals(numRecordsWritten * sizePerRecord, outputRecordBytesCounter.getValue());
    assertEquals(numRecordsWritten, outputRecordsCounter.getValue());
    assertEquals(numRecordsWritten * sizePerRecordWithOverhead, outputBytesWithOverheadCounter.getValue());
    long fileOutputBytes = fileOutputBytesCounter.getValue();
    if (numRecordsWritten > 0) {
        assertTrue(fileOutputBytes > 0);
        if (!shouldCompress) {
            assertTrue(fileOutputBytes > outputRecordBytesCounter.getValue());
        }
    } else {
        assertEquals(0, fileOutputBytes);
    }
    assertEquals(recordsPerBuffer * numExpectedSpills, spilledRecordsCounter.getValue());
    long additionalSpillBytesWritten = additionalSpillBytesWritternCounter.getValue();
    long additionalSpillBytesRead = additionalSpillBytesReadCounter.getValue();
    if (numExpectedSpills == 0) {
        assertEquals(0, additionalSpillBytesWritten);
        assertEquals(0, additionalSpillBytesRead);
    } else {
        assertTrue(additionalSpillBytesWritten > 0);
        assertTrue(additionalSpillBytesRead > 0);
        if (!shouldCompress) {
            assertTrue(additionalSpillBytesWritten > (recordsPerBuffer * numExpectedSpills * sizePerRecord));
            assertTrue(additionalSpillBytesRead > (recordsPerBuffer * numExpectedSpills * sizePerRecord));
        }
    }
    assertTrue(additionalSpillBytesWritten == additionalSpillBytesRead);
    assertEquals(numExpectedSpills, numAdditionalSpillsCounter.getValue());

    BitSet emptyPartitionBits = null;
    // Verify the event returned
    assertEquals(1, events.size());
    assertTrue(events.get(0) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) events.get(0);
    assertEquals(0, cdme.getSourceIndexStart());
    assertEquals(numOutputs, cdme.getCount());
    DataMovementEventPayloadProto eventProto = DataMovementEventPayloadProto
            .parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    assertFalse(eventProto.hasData());
    if (skippedPartitions == null && numRecordsWritten > 0) {
        assertFalse(eventProto.hasEmptyPartitions());
        emptyPartitionBits = new BitSet(numPartitions);
    } else {
        assertTrue(eventProto.hasEmptyPartitions());
        byte[] emptyPartitions = TezCommonUtils
                .decompressByteStringToByteArray(eventProto.getEmptyPartitions());
        emptyPartitionBits = TezUtilsInternal.fromByteArray(emptyPartitions);
        if (numRecordsWritten == 0) {
            assertEquals(numPartitions, emptyPartitionBits.cardinality());
        } else {
            for (Integer e : skippedPartitions) {
                assertTrue(emptyPartitionBits.get(e));
            }
            assertEquals(skippedPartitions.size(), emptyPartitionBits.cardinality());
        }
    }
    if (emptyPartitionBits.cardinality() != numPartitions) {
        assertEquals(HOST_STRING, eventProto.getHost());
        assertEquals(SHUFFLE_PORT, eventProto.getPort());
        assertEquals(uniqueId, eventProto.getPathComponent());
    } else {
        assertFalse(eventProto.hasHost());
        assertFalse(eventProto.hasPort());
        assertFalse(eventProto.hasPathComponent());
    }

    // Verify the actual data
    TezTaskOutput taskOutput = new TezTaskOutputFiles(conf, uniqueId);
    Path outputFilePath = kvWriter.finalOutPath;
    Path spillFilePath = kvWriter.finalIndexPath;
    if (numRecordsWritten > 0) {
        assertTrue(localFs.exists(outputFilePath));
        assertTrue(localFs.exists(spillFilePath));
    } else {
        return; // Special case for 0 records.
    }

    TezSpillRecord spillRecord = new TezSpillRecord(spillFilePath, conf);
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataInputBuffer valBuffer = new DataInputBuffer();
    IntWritable keyDeser = new IntWritable();
    LongWritable valDeser = new LongWritable();
    for (int i = 0; i < numOutputs; i++) {
        if (skippedPartitions != null && skippedPartitions.contains(i)) {
            continue;
        }
        TezIndexRecord indexRecord = spillRecord.getIndex(i);
        FSDataInputStream inStream = FileSystem.getLocal(conf).open(outputFilePath);
        inStream.seek(indexRecord.getStartOffset());
        IFile.Reader reader = new IFile.Reader(inStream, indexRecord.getPartLength(), codec, null, null, false,
                0, -1);
        while (reader.nextRawKey(keyBuffer)) {
            reader.nextRawValue(valBuffer);
            keyDeser.readFields(keyBuffer);
            valDeser.readFields(valBuffer);
            int partition = partitioner.getPartition(keyDeser, valDeser, numOutputs);
            assertTrue(expectedValues.get(partition).remove(keyDeser.get(), valDeser.get()));
        }
        inStream.close();
    }
    for (int i = 0; i < numOutputs; i++) {
        assertEquals(0, expectedValues.get(i).size());
        expectedValues.remove(i);
    }
    assertEquals(0, expectedValues.size());
}
From source file:org.bgi.flexlab.gaea.tools.mapreduce.vcf.sort.VCFSort.java
License:Open Source License
@Override
protected void reduce(LongWritable key, Iterable<VariantContextWritable> records,
        Reducer<LongWritable, VariantContextWritable, NullWritable, VariantContextWritable>.Context ctx)
        throws IOException, InterruptedException {
    // The high-order bits of the composite key select which output the records belong to
    int id = (int) (key.get() >> 40);
    for (VariantContextWritable rec : records) {
        // NullWritable.get() returns the NullWritable singleton, used here as the (ignored) output key
        mos.write(multiOutputs.get(id), NullWritable.get(), rec);
    }
}
From source file:org.bgi.flexlab.gaea.tools.mapreduce.vcfqualitycontrol.variantrecalibratioin.VariantRecalibrationMapper.java
License:Open Source License
@Override
public void map(LongWritable key, VariantContextWritable value, Context context)
        throws IOException, InterruptedException {
    VariantContext vc = value.get();
    if (!validContext(vc))
        return;

    VariantDatumMessenger datum = new VariantDatumMessenger.Builder(manager, vc, options).decodeAnnotations()
            .setLoc(genomeLocParser).setOriginalQual().setFlagV().setPrior().build();
    if (datum != null) {
        context.write(new IntWritable((int) key.get()), new Text(datum.toString()));
    }
}
From source file:org.commoncrawl.util.JoinValue.java
License:Open Source License
public JoinValue(TextBytes tag, LongWritable value) {
    _tag = tag;
    _type = LONG_TYPE_JOIN_VALUE;
    _longValue = value.get();
}
From source file:org.datavec.hadoop.records.reader.mapfile.index.LongIndexToKey.java
License:Apache License
@Override
public List<Pair<Long, Long>> initialize(MapFile.Reader[] readers, Class<? extends Writable> valueClass)
        throws IOException {
    List<Pair<Long, Long>> l = new ArrayList<>(readers.length);
    for (MapFile.Reader r : readers) {
        // Get the first and last keys:
        long first = -1;
        long last = -1;

        // First key: no method for this for some inexplicable reason :/
        LongWritable k = new LongWritable();
        Writable v = ReflectionUtils.newInstance(valueClass, null);
        boolean hasNext = r.next(k, v);
        if (!hasNext) {
            // This map file is empty - no data
            l.add(new Pair<>(-1L, -1L));
            continue;
        }
        first = k.get();

        // Last key: easy
        r.reset();
        r.finalKey(k);
        last = k.get();

        l.add(new Pair<>(first, last));
    }

    // Check that things are actually contiguous:
    List<Pair<Long, Long>> sorted = new ArrayList<>(l.size());
    for (Pair<Long, Long> p : l) {
        if (p.getLeft() >= 0) {
            sorted.add(p);
        }
    }
    Collections.sort(sorted, new Comparator<Pair<Long, Long>>() {
        @Override
        public int compare(Pair<Long, Long> o1, Pair<Long, Long> o2) {
            return Long.compare(o1.getFirst(), o2.getFirst());
        }
    });

    if (sorted.size() == 0) {
        throw new IllegalStateException("Map file is empty - no data available");
    }
    if (sorted.get(0).getFirst() != 0L) {
        throw new UnsupportedOperationException("Minimum key value is not 0: got " + sorted.get(0).getFirst());
    }

    for (int i = 0; i < sorted.size() - 1; i++) {
        long currLast = sorted.get(i).getSecond();
        long nextFirst = sorted.get(i + 1).getFirst();
        if (nextFirst == -1) {
            // Skip empty map file
            continue;
        }
        if (currLast + 1 != nextFirst) {
            throw new IllegalStateException("Keys are not contiguous between readers: first/last indices "
                    + "(inclusive) are " + sorted + ".\n LongIndexKey assumes unique and contiguous "
                    + "LongWritable keys");
        }
    }

    readerIndices = l;
    return readerIndices;
}
From source file:org.eobjects.hadoopdatacleaner.mapreduce.flatfile.FlatFileMapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text csvLine, final Context context)
        throws IOException, InterruptedException {
    if (key.get() == 0L) {
        // A byte offset of 0 means this is the first line of the input, i.e. the CSV header
        context.getConfiguration().set("csv.header.line", csvLine.toString());
        csvParser.parseHeaderRow(csvLine);
    } else {
        while (context.getConfiguration().get("csv.header.line") == null) {
            // Wait for the header to be read.
        }
        InputRow inputRow = csvParser.prepareRow(csvLine);
        Callback mapperEmitterCallback = new MapperEmitter.Callback() {
            public void write(Text key, SortedMapWritable row) throws IOException, InterruptedException {
                context.write(key, row);
            }
        };
        mapperDelegate.run(inputRow, mapperEmitterCallback);
    }
}