List of usage examples for org.apache.hadoop.io.ArrayWritable.get()
public Writable[] get()
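Before the source-file examples below, here is a minimal, self-contained sketch of what get() returns. The class name, the Text values, and the printout are illustrative assumptions, not taken from any of the projects listed.

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class ArrayWritableGetExample {
  public static void main(String[] args) {
    // Wrap two Text values in an ArrayWritable typed to Text
    ArrayWritable aw = new ArrayWritable(Text.class,
        new Writable[] { new Text("first"), new Text("second") });
    // get() returns the backing Writable[]; each element can be cast to the value class
    Writable[] values = aw.get();
    for (Writable w : values) {
      System.out.println(((Text) w).toString());
    }
  }
}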
From source file: com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License: Apache License
public void testReader(boolean partitioned) throws Exception {
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
  String baseInstant = "100";
  File partitionDir = partitioned
      ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant)
      : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant);
  InputFormatTestUtil.commit(basePath, baseInstant);
  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
  logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
  logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
  // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
  // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));
  FileSlice fileSlice = new FileSlice(
      partitioned
          ? FSUtils.getRelativePartitionPath(new Path(basePath.getRoot().getAbsolutePath()),
              new Path(partitionDir.getAbsolutePath()))
          : "default",
      baseInstant, "fileid0");
  logVersionsWithAction.stream().forEach(logVersionWithAction -> {
    try {
      // update files or generate new log file
      int logVersion = logVersionWithAction.getRight();
      String action = logVersionWithAction.getKey();
      int baseInstantTs = Integer.parseInt(baseInstant);
      String instantTime = String.valueOf(baseInstantTs + logVersion);
      String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION)
          ? String.valueOf(baseInstantTs + logVersion - 2)
          : instantTime;
      HoodieLogFormat.Writer writer = null;
      if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
        writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, instantTime,
            String.valueOf(baseInstantTs + logVersion - 1), logVersion);
      } else {
        writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, instantTime, 100, 0,
            logVersion);
      }
      long size = writer.getCurrentSize();
      writer.close();
      assertTrue("block - size should be > 0", size > 0);

      // create a split with baseFile (parquet file written earlier) and new log file(s)
      fileSlice.addLogFile(writer.getLogFile());
      HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
          new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, jobConf),
          basePath.getRoot().getPath(),
          fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
              .map(h -> h.getPath().toString()).collect(Collectors.toList()),
          instantTime);

      // create a RecordReader to be used by HoodieRealtimeRecordReader
      RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
          new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
      JobConf jobConf = new JobConf();
      List<Schema.Field> fields = schema.getFields();
      String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
      String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
      jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
      jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
      if (partitioned) {
        jobConf.set("partition_columns", "datestr");
      }

      // validate record reader compaction
      HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
      // use reader to read base Parquet File and log file, merge in flight and return latest commit
      // here all 100 records should be updated, see above
      NullWritable key = recordReader.createKey();
      ArrayWritable value = recordReader.createValue();
      while (recordReader.next(key, value)) {
        Writable[] values = value.get();
        // check if the record written is with latest commit, here "101"
        Assert.assertEquals(latestInstant, values[0].toString());
        key = recordReader.createKey();
        value = recordReader.createValue();
      }
    } catch (Exception ioe) {
      throw new HoodieException(ioe.getMessage(), ioe);
    }
  });
  // Add Rollback last version to next log-file
}
From source file: com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License: Apache License
@Test
public void testUnMergedReader() throws Exception {
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
  String commitTime = "100";
  final int numRecords = 1000;
  final int firstBatchLastRecordKey = numRecords - 1;
  final int secondBatchLastRecordKey = 2 * numRecords - 1;
  File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numRecords, commitTime);
  InputFormatTestUtil.commit(basePath, commitTime);
  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  // insert new records to log file
  String newCommitTime = "101";
  HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
      newCommitTime, numRecords, numRecords, 0);
  long size = writer.getCurrentSize();
  writer.close();
  assertTrue("block - size should be > 0", size > 0);

  // create a split with baseFile (parquet file written earlier) and new log file(s)
  String logFilePath = writer.getLogFile().getPath().toString();
  HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
      new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
      basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

  // create a RecordReader to be used by HoodieRealtimeRecordReader
  RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
      new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
  JobConf jobConf = new JobConf();
  List<Schema.Field> fields = schema.getFields();
  String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
  String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
  jobConf.set("partition_columns", "datestr");
  // Enable merge skipping.
  jobConf.set("hoodie.realtime.merge.skip", "true");

  // validate unmerged record reader
  RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader);
  // use reader to read base Parquet File and log file
  // here all records should be present. Also ensure log records are in order.
  NullWritable key = recordReader.createKey();
  ArrayWritable value = recordReader.createValue();
  int numRecordsAtCommit1 = 0;
  int numRecordsAtCommit2 = 0;
  Set<Integer> seenKeys = new HashSet<>();
  Integer lastSeenKeyFromLog = firstBatchLastRecordKey;
  while (recordReader.next(key, value)) {
    Writable[] values = value.get();
    String gotCommit = values[0].toString();
    String keyStr = values[2].toString();
    Integer gotKey = Integer.parseInt(keyStr.substring("key".length()));
    if (gotCommit.equals(newCommitTime)) {
      numRecordsAtCommit2++;
      Assert.assertTrue(gotKey > firstBatchLastRecordKey);
      Assert.assertTrue(gotKey <= secondBatchLastRecordKey);
      Assert.assertEquals(gotKey.intValue(), lastSeenKeyFromLog + 1);
      lastSeenKeyFromLog++;
    } else {
      numRecordsAtCommit1++;
      Assert.assertTrue(gotKey >= 0);
      Assert.assertTrue(gotKey <= firstBatchLastRecordKey);
    }
    // Ensure unique key
    Assert.assertFalse(seenKeys.contains(gotKey));
    seenKeys.add(gotKey);
    key = recordReader.createKey();
    value = recordReader.createValue();
  }
  Assert.assertEquals(numRecords, numRecordsAtCommit1);
  Assert.assertEquals(numRecords, numRecordsAtCommit2);
  Assert.assertEquals(2 * numRecords, seenKeys.size());
}
From source file: com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License: Apache License
@Test
public void testReaderWithNestedAndComplexSchema() throws Exception {
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema());
  HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
  String commitTime = "100";
  int numberOfRecords = 100;
  int numberOfLogRecords = numberOfRecords / 2;
  File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime);
  InputFormatTestUtil.commit(basePath, commitTime);
  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  // update files or generate new log file
  String newCommitTime = "101";
  HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime,
      numberOfLogRecords);
  long size = writer.getCurrentSize();
  writer.close();
  assertTrue("block - size should be > 0", size > 0);
  InputFormatTestUtil.deltaCommit(basePath, newCommitTime);

  // create a split with baseFile (parquet file written earlier) and new log file(s)
  String logFilePath = writer.getLogFile().getPath().toString();
  HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
      new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
      basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

  // create a RecordReader to be used by HoodieRealtimeRecordReader
  RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
      new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
  JobConf jobConf = new JobConf();
  List<Schema.Field> fields = schema.getFields();
  String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
  String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
  jobConf.set("partition_columns", "datestr");

  // validate record reader compaction
  HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
  // use reader to read base Parquet File and log file, merge in flight and return latest commit
  // here the first 50 records should be updated, see above
  NullWritable key = recordReader.createKey();
  ArrayWritable value = recordReader.createValue();
  int numRecordsRead = 0;
  while (recordReader.next(key, value)) {
    int currentRecordNo = numRecordsRead;
    ++numRecordsRead;
    Writable[] values = value.get();
    String recordCommitTime;
    // check if the record written is with latest commit, here "101"
    if (numRecordsRead > numberOfLogRecords) {
      recordCommitTime = commitTime;
    } else {
      recordCommitTime = newCommitTime;
    }
    String recordCommitTimeSuffix = "@" + recordCommitTime;

    Assert.assertEquals(values[0].toString(), recordCommitTime);
    key = recordReader.createKey();
    value = recordReader.createValue();

    // Assert type STRING
    Assert.assertEquals("test value for field: field1", values[5].toString(), "field" + currentRecordNo);
    Assert.assertEquals("test value for field: field2", values[6].toString(),
        "field" + currentRecordNo + recordCommitTimeSuffix);
    Assert.assertEquals("test value for field: name", values[7].toString(), "name" + currentRecordNo);

    // Assert type INT
    IntWritable intWritable = (IntWritable) values[8];
    Assert.assertEquals("test value for field: favoriteIntNumber", intWritable.get(),
        currentRecordNo + recordCommitTime.hashCode());

    // Assert type LONG
    LongWritable longWritable = (LongWritable) values[9];
    Assert.assertEquals("test value for field: favoriteNumber", longWritable.get(),
        currentRecordNo + recordCommitTime.hashCode());

    // Assert type FLOAT
    FloatWritable floatWritable = (FloatWritable) values[10];
    Assert.assertEquals("test value for field: favoriteFloatNumber", floatWritable.get(),
        (float) ((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0);

    // Assert type DOUBLE
    DoubleWritable doubleWritable = (DoubleWritable) values[11];
    Assert.assertEquals("test value for field: favoriteDoubleNumber", doubleWritable.get(),
        (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0);

    // Assert type MAP
    ArrayWritable mapItem = (ArrayWritable) values[12];
    Writable mapItemValue1 = mapItem.get()[0];
    Writable mapItemValue2 = mapItem.get()[1];
    Assert.assertEquals("test value for field: tags",
        ((ArrayWritable) mapItemValue1).get()[0].toString(), "mapItem1");
    Assert.assertEquals("test value for field: tags",
        ((ArrayWritable) mapItemValue2).get()[0].toString(), "mapItem2");
    Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get().length, 2);
    Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get().length, 2);
    Writable mapItemValue1value = ((ArrayWritable) mapItemValue1).get()[1];
    Writable mapItemValue2value = ((ArrayWritable) mapItemValue2).get()[1];
    Assert.assertEquals("test value for field: tags[\"mapItem1\"].item1",
        ((ArrayWritable) mapItemValue1value).get()[0].toString(), "item" + currentRecordNo);
    Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1",
        ((ArrayWritable) mapItemValue2value).get()[0].toString(), "item2" + currentRecordNo);
    Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2",
        ((ArrayWritable) mapItemValue1value).get()[1].toString(),
        "item" + currentRecordNo + recordCommitTimeSuffix);
    Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2",
        ((ArrayWritable) mapItemValue2value).get()[1].toString(),
        "item2" + currentRecordNo + recordCommitTimeSuffix);

    // Assert type RECORD
    ArrayWritable recordItem = (ArrayWritable) values[13];
    Writable[] nestedRecord = recordItem.get();
    Assert.assertEquals("test value for field: testNestedRecord.isAdmin",
        ((BooleanWritable) nestedRecord[0]).get(), false);
    Assert.assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(),
        "UserId" + currentRecordNo + recordCommitTimeSuffix);

    // Assert type ARRAY
    ArrayWritable arrayValue = (ArrayWritable) values[14];
    Writable[] arrayValues = arrayValue.get();
    for (int i = 0; i < arrayValues.length; i++) {
      Assert.assertEquals("test value for field: stringArray",
          "stringArray" + i + recordCommitTimeSuffix, arrayValues[i].toString());
    }
  }
}
From source file: com.uber.hoodie.hadoop.realtime.RealtimeCompactedRecordReader.java
License: Apache License
@Override
public boolean next(NullWritable aVoid, ArrayWritable arrayWritable) throws IOException {
  // Call the underlying parquetReader.next - which may replace the passed in ArrayWritable
  // with a new block of values
  boolean result = this.parquetReader.next(aVoid, arrayWritable);
  if (!result) {
    // if the result is false, then there are no more records
    return false;
  } else {
    // TODO(VC): Right now, we assume all records in log, have a matching base record. (which
    // would be true until we have a way to index logs too)
    // return from delta records map if we have some match.
    String key = arrayWritable.get()[HoodieRealtimeInputFormat.HOODIE_RECORD_KEY_COL_POS].toString();
    if (deltaRecordMap.containsKey(key)) {
      // TODO(NA): Invoke preCombine here by converting arrayWritable to Avro. This is required since the
      // deltaRecord may not be a full record and needs values of columns from the parquet
      Optional<GenericRecord> rec;
      if (usesCustomPayload) {
        rec = deltaRecordMap.get(key).getData().getInsertValue(getWriterSchema());
      } else {
        rec = deltaRecordMap.get(key).getData().getInsertValue(getReaderSchema());
      }
      if (!rec.isPresent()) {
        // If the record is not present, this is a delete record using an empty payload so skip this base
        // record and move to the next record
        return next(aVoid, arrayWritable);
      }
      GenericRecord recordToReturn = rec.get();
      if (usesCustomPayload) {
        // If using a custom payload, return only the projection fields
        recordToReturn = HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(rec.get(), getReaderSchema());
      }
      // we assume, a later safe record in the log, is newer than what we have in the map &
      // replace it.
      ArrayWritable aWritable = (ArrayWritable) avroToArrayWritable(recordToReturn, getWriterSchema());
      Writable[] replaceValue = aWritable.get();
      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("key %s, base values: %s, log values: %s", key,
            arrayWritableToString(arrayWritable), arrayWritableToString(aWritable)));
      }
      Writable[] originalValue = arrayWritable.get();
      try {
        System.arraycopy(replaceValue, 0, originalValue, 0, originalValue.length);
        arrayWritable.set(originalValue);
      } catch (RuntimeException re) {
        LOG.error("Got exception when doing array copy", re);
        LOG.error("Base record :" + arrayWritableToString(arrayWritable));
        LOG.error("Log record :" + arrayWritableToString(aWritable));
        throw re;
      }
    }
    return true;
  }
}
From source file: com.uber.hoodie.hadoop.SafeParquetRecordReaderWrapper.java
License: Apache License
public SafeParquetRecordReaderWrapper(RecordReader<NullWritable, ArrayWritable> parquetReader) {
  this.parquetReader = parquetReader;
  ArrayWritable arrayWritable = parquetReader.createValue();
  this.valueClass = arrayWritable.getValueClass();
  this.numValueFields = arrayWritable.get().length;
}
From source file: edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java
License: Apache License
/**
 * Cleans raw volume content.
 *
 * @param volume
 *            volume content represented as a list of pages
 */
public static void cleanVolume(ArrayWritable volume) {
  Text[] pages = (Text[]) volume.get();
  for (int i = 0; i < pages.length; i++) {
    // TODO: apply clean logic to each page
    String pageContent = pages[i].toString();
    /**
     * Logic to clean the pageContent
     */
    // Logic goes here
    /**
     * set the cleaned content back; pageContent is the page content after cleaning
     */
    pages[i].set(pageContent);
  }
  // set cleaned volume content back
  volume.set(pages);
}
From source file: edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java
License: Apache License
/**
 * Extracts interested words (to form the full word set).
 *
 * @param volume
 *            volume content represented as a list of pages
 * @return interested words appearing in the specified volume
 */
public static Set<String> getWordSet(ArrayWritable volume) {
  Set<String> wordSet = new HashSet<String>();
  Text[] pages = (Text[]) volume.get();
  for (int i = 0; i < pages.length; i++) {
    // TODO: apply logic to extract interested words from each page
    String pageContent = pages[i].toString();
    /**
     * Logic to extract the interested words
     */
    // Logic goes here
    /**
     * Add the extracted words to 'wordSet', e.g. wordSet.addAll(extractedWords), where
     * 'extractedWords' contains the interested words extracted from 'pageContent'.
     * Words should be converted to lowercase.
     */
  }
  return wordSet;
}
From source file: edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java
License: Apache License
/**
 * Transforms a cleaned volume from raw text to indices of words in the full word set.
 *
 * @param volume
 *            volume content represented as a list of pages
 * @param wordSet
 *            set of interested words (full word set)
 */
public static void transformVolume(ArrayWritable volume, List<String> wordSet) {
  Text[] pages = (Text[]) volume.get();
  for (int i = 0; i < pages.length; i++) {
    /**
     * Need to change the RegExp when we are also interested in general symbols other than words
     */
    String[] tokens = pages[i].toString().toLowerCase().split("\\W+");
    StringBuilder transformedPage = new StringBuilder();
    int idx = -1;
    for (String token : tokens) {
      idx = wordSet.indexOf(token);
      /**
       * omit tokens that don't appear in the word set
       */
      if (idx != -1) {
        transformedPage.append(idx + " ");
      }
    }
    // remove trailing ' '
    pages[i].set(transformedPage.toString().trim());
  }
  // set volume
  volume.set(pages);
}
From source file: edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java
License: Apache License
/**
 * Converts volume content to indices in the sub word set.
 *
 * @param volume
 *            volume content represented as indices into the full word set
 * @param mappingIndices
 */
public static void fullWordSet2SubWordSet(ArrayWritable volume, List<Integer> mappingIndices) {
  Text[] pages = (Text[]) volume.get();
  for (int i = 0; i < pages.length; i++) {
    String[] indices = pages[i].toString().split(" ");
    StringBuilder pg = new StringBuilder();
    for (int j = 0; j < indices.length; j++) {
      // parse the token before the lookup: mappingIndices holds Integers, so a raw String never matches
      int idx = mappingIndices.indexOf(Integer.valueOf(indices[j]));
      /**
       * omit words not in the sub set
       */
      if (idx != -1) {
        pg.append(idx + " ");
      }
    }
    // remove trailing ' '
    pages[i].set(pg.toString().trim());
  }
  // set volume
  volume.set(pages);
}
From source file: edu.ub.ahstfg.io.WritableConverter.java
License: Open Source License
/**
 * Converts an ArrayWritable of IntWritable values to a LinkedList of Short.
 *
 * @param input ArrayWritable to convert.
 * @return Converted LinkedList of Short values.
 */
public static LinkedList<Short> arrayWritable2LinkedListShort(ArrayWritable input) {
  LinkedList<Short> ret = new LinkedList<Short>();
  Writable[] ws = input.get();
  IntWritable l;
  for (Writable w : ws) {
    l = (IntWritable) w;
    ret.add((short) l.get());
  }
  return ret;
}