List of usage examples for org.apache.hadoop.io.ArrayWritable.get()
public Writable[] get()
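Before the source-file examples below, here is a minimal, self-contained sketch of what get() returns. The class name, the Text values, and the printout are illustrative assumptions, not taken from any of the projects listed.

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class ArrayWritableGetExample {
  public static void main(String[] args) {
    // Wrap two Text values in an ArrayWritable typed to Text
    ArrayWritable aw = new ArrayWritable(Text.class,
        new Writable[] { new Text("first"), new Text("second") });
    // get() returns the backing Writable[]; each element can be cast to the value class
    Writable[] values = aw.get();
    for (Writable w : values) {
      System.out.println(((Text) w).toString());
    }
  }
}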
From source file: com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License: Apache License
public void testReader(boolean partitioned) throws Exception {
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
  String baseInstant = "100";
  File partitionDir = partitioned
      ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant)
      : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant);
  InputFormatTestUtil.commit(basePath, baseInstant);
  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
  logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
  logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
  // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
  // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));
  FileSlice fileSlice = new FileSlice(
      partitioned
          ? FSUtils.getRelativePartitionPath(new Path(basePath.getRoot().getAbsolutePath()),
              new Path(partitionDir.getAbsolutePath()))
          : "default",
      baseInstant, "fileid0");
  logVersionsWithAction.stream().forEach(logVersionWithAction -> {
    try {
      // update files or generate new log file
      int logVersion = logVersionWithAction.getRight();
      String action = logVersionWithAction.getKey();
      int baseInstantTs = Integer.parseInt(baseInstant);
      String instantTime = String.valueOf(baseInstantTs + logVersion);
      String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION)
          ? String.valueOf(baseInstantTs + logVersion - 2)
          : instantTime;
      HoodieLogFormat.Writer writer = null;
      if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
        writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, instantTime,
            String.valueOf(baseInstantTs + logVersion - 1), logVersion);
      } else {
        writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, instantTime, 100, 0,
            logVersion);
      }
      long size = writer.getCurrentSize();
      writer.close();
      assertTrue("block - size should be > 0", size > 0);

      // create a split with baseFile (parquet file written earlier) and new log file(s)
      fileSlice.addLogFile(writer.getLogFile());
      HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
          new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1, jobConf),
          basePath.getRoot().getPath(),
          fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
              .map(h -> h.getPath().toString()).collect(Collectors.toList()),
          instantTime);

      // create a RecordReader to be used by HoodieRealtimeRecordReader
      RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
          new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
      JobConf jobConf = new JobConf();
      List<Schema.Field> fields = schema.getFields();
      String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
      String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
      jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
      jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
      if (partitioned) {
        jobConf.set("partition_columns", "datestr");
      }

      // validate record reader compaction
      HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
      // use reader to read base Parquet File and log file, merge in flight and return latest commit
      // here all 100 records should be updated, see above
      NullWritable key = recordReader.createKey();
      ArrayWritable value = recordReader.createValue();
      while (recordReader.next(key, value)) {
        Writable[] values = value.get();
        // check if the record written is with latest commit, here "101"
        Assert.assertEquals(latestInstant, values[0].toString());
        key = recordReader.createKey();
        value = recordReader.createValue();
      }
    } catch (Exception ioe) {
      throw new HoodieException(ioe.getMessage(), ioe);
    }
  });
  // Add Rollback last version to next log-file
}
From source file: com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License: Apache License
@Test
public void testUnMergedReader() throws Exception {
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
  String commitTime = "100";
  final int numRecords = 1000;
  final int firstBatchLastRecordKey = numRecords - 1;
  final int secondBatchLastRecordKey = 2 * numRecords - 1;
  File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numRecords, commitTime);
  InputFormatTestUtil.commit(basePath, commitTime);
  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  // insert new records to log file
  String newCommitTime = "101";
  HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
      newCommitTime, numRecords, numRecords, 0);
  long size = writer.getCurrentSize();
  writer.close();
  assertTrue("block - size should be > 0", size > 0);

  // create a split with baseFile (parquet file written earlier) and new log file(s)
  String logFilePath = writer.getLogFile().getPath().toString();
  HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
      new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
      basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

  // create a RecordReader to be used by HoodieRealtimeRecordReader
  RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
      new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
  JobConf jobConf = new JobConf();
  List<Schema.Field> fields = schema.getFields();
  String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
  String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
  jobConf.set("partition_columns", "datestr");
  // Enable merge skipping.
  jobConf.set("hoodie.realtime.merge.skip", "true");

  // validate unmerged record reader
  RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader);
  // use reader to read base Parquet File and log file
  // here all records should be present. Also ensure log records are in order.
  NullWritable key = recordReader.createKey();
  ArrayWritable value = recordReader.createValue();
  int numRecordsAtCommit1 = 0;
  int numRecordsAtCommit2 = 0;
  Set<Integer> seenKeys = new HashSet<>();
  Integer lastSeenKeyFromLog = firstBatchLastRecordKey;
  while (recordReader.next(key, value)) {
    Writable[] values = value.get();
    String gotCommit = values[0].toString();
    String keyStr = values[2].toString();
    Integer gotKey = Integer.parseInt(keyStr.substring("key".length()));
    if (gotCommit.equals(newCommitTime)) {
      numRecordsAtCommit2++;
      Assert.assertTrue(gotKey > firstBatchLastRecordKey);
      Assert.assertTrue(gotKey <= secondBatchLastRecordKey);
      Assert.assertEquals(gotKey.intValue(), lastSeenKeyFromLog + 1);
      lastSeenKeyFromLog++;
    } else {
      numRecordsAtCommit1++;
      Assert.assertTrue(gotKey >= 0);
      Assert.assertTrue(gotKey <= firstBatchLastRecordKey);
    }
    // Ensure unique key
    Assert.assertFalse(seenKeys.contains(gotKey));
    seenKeys.add(gotKey);
    key = recordReader.createKey();
    value = recordReader.createValue();
  }
  Assert.assertEquals(numRecords, numRecordsAtCommit1);
  Assert.assertEquals(numRecords, numRecordsAtCommit2);
  Assert.assertEquals(2 * numRecords, seenKeys.size());
}
From source file: com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License: Apache License
@Test
public void testReaderWithNestedAndComplexSchema() throws Exception {
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema());
  HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
  String commitTime = "100";
  int numberOfRecords = 100;
  int numberOfLogRecords = numberOfRecords / 2;
  File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime);
  InputFormatTestUtil.commit(basePath, commitTime);
  // Add the paths
  FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

  // update files or generate new log file
  String newCommitTime = "101";
  HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime,
      numberOfLogRecords);
  long size = writer.getCurrentSize();
  writer.close();
  assertTrue("block - size should be > 0", size > 0);
  InputFormatTestUtil.deltaCommit(basePath, newCommitTime);

  // create a split with baseFile (parquet file written earlier) and new log file(s)
  String logFilePath = writer.getLogFile().getPath().toString();
  HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
      new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
      basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

  // create a RecordReader to be used by HoodieRealtimeRecordReader
  RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
      new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
  JobConf jobConf = new JobConf();
  List<Schema.Field> fields = schema.getFields();
  String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
  String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
  jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
  jobConf.set("partition_columns", "datestr");

  // validate record reader compaction
  HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
  // use reader to read base Parquet File and log file, merge in flight and return latest commit
  // here the first 50 records should be updated, see above
  NullWritable key = recordReader.createKey();
  ArrayWritable value = recordReader.createValue();
  int numRecordsRead = 0;
  while (recordReader.next(key, value)) {
    int currentRecordNo = numRecordsRead;
    ++numRecordsRead;
    Writable[] values = value.get();
    String recordCommitTime;
    // check if the record written is with latest commit, here "101"
    if (numRecordsRead > numberOfLogRecords) {
      recordCommitTime = commitTime;
    } else {
      recordCommitTime = newCommitTime;
    }
    String recordCommitTimeSuffix = "@" + recordCommitTime;

    Assert.assertEquals(values[0].toString(), recordCommitTime);
    key = recordReader.createKey();
    value = recordReader.createValue();

    // Assert type STRING
    Assert.assertEquals("test value for field: field1", values[5].toString(), "field" + currentRecordNo);
    Assert.assertEquals("test value for field: field2", values[6].toString(),
        "field" + currentRecordNo + recordCommitTimeSuffix);
    Assert.assertEquals("test value for field: name", values[7].toString(), "name" + currentRecordNo);

    // Assert type INT
    IntWritable intWritable = (IntWritable) values[8];
    Assert.assertEquals("test value for field: favoriteIntNumber", intWritable.get(),
        currentRecordNo + recordCommitTime.hashCode());

    // Assert type LONG
    LongWritable longWritable = (LongWritable) values[9];
    Assert.assertEquals("test value for field: favoriteNumber", longWritable.get(),
        currentRecordNo + recordCommitTime.hashCode());

    // Assert type FLOAT
    FloatWritable floatWritable = (FloatWritable) values[10];
    Assert.assertEquals("test value for field: favoriteFloatNumber", floatWritable.get(),
        (float) ((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0);

    // Assert type DOUBLE
    DoubleWritable doubleWritable = (DoubleWritable) values[11];
    Assert.assertEquals("test value for field: favoriteDoubleNumber", doubleWritable.get(),
        (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0);

    // Assert type MAP
    ArrayWritable mapItem = (ArrayWritable) values[12];
    Writable mapItemValue1 = mapItem.get()[0];
    Writable mapItemValue2 = mapItem.get()[1];
    Assert.assertEquals("test value for field: tags",
        ((ArrayWritable) mapItemValue1).get()[0].toString(), "mapItem1");
    Assert.assertEquals("test value for field: tags",
        ((ArrayWritable) mapItemValue2).get()[0].toString(), "mapItem2");
    Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get().length, 2);
    Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get().length, 2);
    Writable mapItemValue1value = ((ArrayWritable) mapItemValue1).get()[1];
    Writable mapItemValue2value = ((ArrayWritable) mapItemValue2).get()[1];
    Assert.assertEquals("test value for field: tags[\"mapItem1\"].item1",
        ((ArrayWritable) mapItemValue1value).get()[0].toString(), "item" + currentRecordNo);
    Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1",
        ((ArrayWritable) mapItemValue2value).get()[0].toString(), "item2" + currentRecordNo);
    Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2",
        ((ArrayWritable) mapItemValue1value).get()[1].toString(),
        "item" + currentRecordNo + recordCommitTimeSuffix);
    Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2",
        ((ArrayWritable) mapItemValue2value).get()[1].toString(),
        "item2" + currentRecordNo + recordCommitTimeSuffix);

    // Assert type RECORD
    ArrayWritable recordItem = (ArrayWritable) values[13];
    Writable[] nestedRecord = recordItem.get();
    Assert.assertEquals("test value for field: testNestedRecord.isAdmin",
        ((BooleanWritable) nestedRecord[0]).get(), false);
    Assert.assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(),
        "UserId" + currentRecordNo + recordCommitTimeSuffix);

    // Assert type ARRAY
    ArrayWritable arrayValue = (ArrayWritable) values[14];
    Writable[] arrayValues = arrayValue.get();
    for (int i = 0; i < arrayValues.length; i++) {
      Assert.assertEquals("test value for field: stringArray",
          "stringArray" + i + recordCommitTimeSuffix, arrayValues[i].toString());
    }
  }
}
From source file: com.uber.hoodie.hadoop.realtime.RealtimeCompactedRecordReader.java
License: Apache License
@Override
public boolean next(NullWritable aVoid, ArrayWritable arrayWritable) throws IOException {
  // Call the underlying parquetReader.next - which may replace the passed in ArrayWritable
  // with a new block of values
  boolean result = this.parquetReader.next(aVoid, arrayWritable);
  if (!result) {
    // if the result is false, then there are no more records
    return false;
  } else {
    // TODO(VC): Right now, we assume all records in log, have a matching base record. (which
    // would be true until we have a way to index logs too)
    // return from delta records map if we have some match.
    String key = arrayWritable.get()[HoodieRealtimeInputFormat.HOODIE_RECORD_KEY_COL_POS].toString();
    if (deltaRecordMap.containsKey(key)) {
      // TODO(NA): Invoke preCombine here by converting arrayWritable to Avro. This is required since the
      // deltaRecord may not be a full record and needs values of columns from the parquet
      Optional<GenericRecord> rec;
      if (usesCustomPayload) {
        rec = deltaRecordMap.get(key).getData().getInsertValue(getWriterSchema());
      } else {
        rec = deltaRecordMap.get(key).getData().getInsertValue(getReaderSchema());
      }
      if (!rec.isPresent()) {
        // If the record is not present, this is a delete record using an empty payload so skip this base
        // record and move to the next record
        return next(aVoid, arrayWritable);
      }
      GenericRecord recordToReturn = rec.get();
      if (usesCustomPayload) {
        // If using a custom payload, return only the projection fields
        recordToReturn = HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(rec.get(), getReaderSchema());
      }
      // we assume, a later safe record in the log, is newer than what we have in the map &
      // replace it.
      ArrayWritable aWritable = (ArrayWritable) avroToArrayWritable(recordToReturn, getWriterSchema());
      Writable[] replaceValue = aWritable.get();
      if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("key %s, base values: %s, log values: %s", key,
            arrayWritableToString(arrayWritable), arrayWritableToString(aWritable)));
      }
      Writable[] originalValue = arrayWritable.get();
      try {
        System.arraycopy(replaceValue, 0, originalValue, 0, originalValue.length);
        arrayWritable.set(originalValue);
      } catch (RuntimeException re) {
        LOG.error("Got exception when doing array copy", re);
        LOG.error("Base record :" + arrayWritableToString(arrayWritable));
        LOG.error("Log record :" + arrayWritableToString(aWritable));
        throw re;
      }
    }
    return true;
  }
}
From source file: com.uber.hoodie.hadoop.SafeParquetRecordReaderWrapper.java
License: Apache License
public SafeParquetRecordReaderWrapper(RecordReader<NullWritable, ArrayWritable> parquetReader) {
  this.parquetReader = parquetReader;
  ArrayWritable arrayWritable = parquetReader.createValue();
  this.valueClass = arrayWritable.getValueClass();
  this.numValueFields = arrayWritable.get().length;
}
From source file: edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java
License: Apache License
/**
 * Cleans raw volume content.
 *
 * @param volume
 *            volume content represented as a list of pages
 */
public static void cleanVolume(ArrayWritable volume) {
  Text[] pages = (Text[]) volume.get();
  for (int i = 0; i < pages.length; i++) {
    // TODO: apply clean logic to each page
    String pageContent = pages[i].toString();
    /**
     * Logic to clean the pageContent
     */
    // Logic goes here
    /**
     * set the cleaned content back; pageContent is the page content after cleaning
     */
    pages[i].set(pageContent);
  }
  // set cleaned volume content back
  volume.set(pages);
}
From source file: edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java
License: Apache License
/**
 * Extracts interested words (to form the full word set).
 *
 * @param volume
 *            volume content represented as a list of pages
 * @return interested words appearing in the specified volume
 */
public static Set<String> getWordSet(ArrayWritable volume) {
  Set<String> wordSet = new HashSet<String>();
  Text[] pages = (Text[]) volume.get();
  for (int i = 0; i < pages.length; i++) {
    // TODO: apply logic to extract interested words from each page
    String pageContent = pages[i].toString();
    /**
     * Logic to extract the interested words
     */
    // Logic goes here
    /**
     * Add the extracted words to 'wordSet', e.g. wordSet.addAll(extractedWords), where
     * 'extractedWords' contains the interested words extracted from 'pageContent'.
     * Words should be converted to lowercase.
     */
  }
  return wordSet;
}
From source file: edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java
License: Apache License
/**
 * Transforms a cleaned volume from raw text to indices of words in the full word set.
 *
 * @param volume
 *            volume content represented as a list of pages
 * @param wordSet
 *            set of interested words (full word set)
 */
public static void transformVolume(ArrayWritable volume, List<String> wordSet) {
  Text[] pages = (Text[]) volume.get();
  for (int i = 0; i < pages.length; i++) {
    /**
     * Need to change the RegExp when we are also interested in general symbols other than words
     */
    String[] tokens = pages[i].toString().toLowerCase().split("\\W+");
    StringBuilder transformedPage = new StringBuilder();
    int idx = -1;
    for (String token : tokens) {
      idx = wordSet.indexOf(token);
      /**
       * omit tokens that don't appear in the word set
       */
      if (idx != -1) {
        transformedPage.append(idx + " ");
      }
    }
    // remove trailing ' '
    pages[i].set(transformedPage.toString().trim());
  }
  // set volume
  volume.set(pages);
}
From source file: edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java
License: Apache License
/**
 * Converts volume content to indices in the sub word set.
 *
 * @param volume
 *            volume content represented as indices into the full word set
 * @param mappingIndices
 */
public static void fullWordSet2SubWordSet(ArrayWritable volume, List<Integer> mappingIndices) {
  Text[] pages = (Text[]) volume.get();
  for (int i = 0; i < pages.length; i++) {
    String[] indices = pages[i].toString().split(" ");
    StringBuilder pg = new StringBuilder();
    for (int j = 0; j < indices.length; j++) {
      // parse the token before the lookup: mappingIndices holds Integers, so a raw String never matches
      int idx = mappingIndices.indexOf(Integer.valueOf(indices[j]));
      /**
       * omit words not in the sub set
       */
      if (idx != -1) {
        pg.append(idx + " ");
      }
    }
    // remove trailing ' '
    pages[i].set(pg.toString().trim());
  }
  // set volume
  volume.set(pages);
}
From source file: edu.ub.ahstfg.io.WritableConverter.java
License: Open Source License
/**
 * Converts an ArrayWritable of IntWritable values to a LinkedList of Short.
 *
 * @param input ArrayWritable to convert.
 * @return Converted LinkedList of Short values.
 */
public static LinkedList<Short> arrayWritable2LinkedListShort(ArrayWritable input) {
  LinkedList<Short> ret = new LinkedList<Short>();
  Writable[] ws = input.get();
  IntWritable l;
  for (Writable w : ws) {
    l = (IntWritable) w;
    ret.add((short) l.get());
  }
  return ret;
}