Example usage for org.apache.hadoop.io ArrayWritable get

List of usage examples for org.apache.hadoop.io ArrayWritable get

Introduction

This page lists example usages of the org.apache.hadoop.io ArrayWritable get method.

Prototype

public Writable[] get() 
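
Before the full examples below, here is a minimal, self-contained sketch of the call: get() returns the backing Writable[] held by the ArrayWritable. The class and constructor used are standard Hadoop API; the sample values are invented for illustration.

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class ArrayWritableGetExample {
    public static void main(String[] args) {
        // Wrap a few Text values in an ArrayWritable.
        ArrayWritable words = new ArrayWritable(Text.class,
                new Writable[] { new Text("alpha"), new Text("beta") });

        // get() hands back the underlying Writable[]; cast elements as needed.
        Writable[] values = words.get();
        for (Writable value : values) {
            System.out.println(value.toString());
        }
    }
}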

Usage

From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java

License:Apache License

public void testReader(boolean partitioned) throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
            HoodieTableType.MERGE_ON_READ);
    String baseInstant = "100";
    File partitionDir = partitioned
            ? InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 100, baseInstant)
            : InputFormatTestUtil.prepareNonPartitionedParquetDataset(basePath, schema, 1, 100, baseInstant);
    InputFormatTestUtil.commit(basePath, baseInstant);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    List<Pair<String, Integer>> logVersionsWithAction = new ArrayList<>();
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 1));
    logVersionsWithAction.add(Pair.of(HoodieTimeline.DELTA_COMMIT_ACTION, 2));
    // TODO: HUDI-154 Once Hive 2.x PR (PR-674) is merged, enable this change
    // logVersionsWithAction.add(Pair.of(HoodieTimeline.ROLLBACK_ACTION, 3));
    FileSlice fileSlice = new FileSlice(
            partitioned ? FSUtils.getRelativePartitionPath(new Path(basePath.getRoot().getAbsolutePath()),
                    new Path(partitionDir.getAbsolutePath())) : "default",
            baseInstant, "fileid0");
    logVersionsWithAction.stream().forEach(logVersionWithAction -> {
        try {
            // update files or generate new log file
            int logVersion = logVersionWithAction.getRight();
            String action = logVersionWithAction.getKey();
            int baseInstantTs = Integer.parseInt(baseInstant);
            String instantTime = String.valueOf(baseInstantTs + logVersion);
            String latestInstant = action.equals(HoodieTimeline.ROLLBACK_ACTION)
                    ? String.valueOf(baseInstantTs + logVersion - 2)
                    : instantTime;

            HoodieLogFormat.Writer writer = null;
            if (action.equals(HoodieTimeline.ROLLBACK_ACTION)) {
                writer = writeRollback(partitionDir, schema, "fileid0", baseInstant, instantTime,
                        String.valueOf(baseInstantTs + logVersion - 1), logVersion);
            } else {
                writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", baseInstant, instantTime, 100,
                        0, logVersion);
            }
            long size = writer.getCurrentSize();
            writer.close();
            assertTrue("block - size should be > 0", size > 0);

            //create a split with baseFile (parquet file written earlier) and new log file(s)
            fileSlice.addLogFile(writer.getLogFile());
            HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
                    new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + baseInstant + ".parquet"), 0, 1,
                            jobConf),
                    basePath.getRoot().getPath(),
                    fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator())
                            .map(h -> h.getPath().toString()).collect(Collectors.toList()),
                    instantTime);

            //create a RecordReader to be used by HoodieRealtimeRecordReader
            RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
                    new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf,
                    null);
            JobConf jobConf = new JobConf();
            List<Schema.Field> fields = schema.getFields();
            String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
            String positions = fields.stream().map(f -> String.valueOf(f.pos()))
                    .collect(Collectors.joining(","));
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
            jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
            if (partitioned) {
                jobConf.set("partition_columns", "datestr");
            }

            //validate record reader compaction
            HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);

            //use reader to read base Parquet File and log file, merge in flight and return latest commit
            //here all 100 records should be updated, see above
            NullWritable key = recordReader.createKey();
            ArrayWritable value = recordReader.createValue();
            while (recordReader.next(key, value)) {
                Writable[] values = value.get();
                //check if the record written is with latest commit, here "101"
                Assert.assertEquals(latestInstant, values[0].toString());
                key = recordReader.createKey();
                value = recordReader.createValue();
            }
        } catch (Exception ioe) {
            throw new HoodieException(ioe.getMessage(), ioe);
        }
    });

    // Add Rollback last version to next log-file

}

From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java

License:Apache License

@Test
public void testUnMergedReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
            HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    final int firstBatchLastRecordKey = numRecords - 1;
    final int secondBatchLastRecordKey = 2 * numRecords - 1;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // insert new records to log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
            newCommitTime, numRecords, numRecords, 0);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    //create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    //create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");
    // Enable merge skipping.
    jobConf.set("hoodie.realtime.merge.skip", "true");

    //validate unmerged record reader
    RealtimeUnmergedRecordReader recordReader = new RealtimeUnmergedRecordReader(split, jobConf, reader);

    //use reader to read base Parquet File and log file
    //here all records should be present. Also ensure log records are in order.
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsAtCommit1 = 0;
    int numRecordsAtCommit2 = 0;
    Set<Integer> seenKeys = new HashSet<>();
    Integer lastSeenKeyFromLog = firstBatchLastRecordKey;
    while (recordReader.next(key, value)) {
        Writable[] values = value.get();
        String gotCommit = values[0].toString();
        String keyStr = values[2].toString();
        Integer gotKey = Integer.parseInt(keyStr.substring("key".length()));
        if (gotCommit.equals(newCommitTime)) {
            numRecordsAtCommit2++;
            Assert.assertTrue(gotKey > firstBatchLastRecordKey);
            Assert.assertTrue(gotKey <= secondBatchLastRecordKey);
            Assert.assertEquals(gotKey.intValue(), lastSeenKeyFromLog + 1);
            lastSeenKeyFromLog++;
        } else {
            numRecordsAtCommit1++;
            Assert.assertTrue(gotKey >= 0);
            Assert.assertTrue(gotKey <= firstBatchLastRecordKey);
        }
        // Ensure unique key
        Assert.assertFalse(seenKeys.contains(gotKey));
        seenKeys.add(gotKey);
        key = recordReader.createKey();
        value = recordReader.createValue();
    }
    Assert.assertEquals(numRecords, numRecordsAtCommit1);
    Assert.assertEquals(numRecords, numRecordsAtCommit2);
    Assert.assertEquals(2 * numRecords, seenKeys.size());
}

From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java

License:Apache License

@Test
public void testReaderWithNestedAndComplexSchema() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
            HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    int numberOfRecords = 100;
    int numberOfLogRecords = numberOfRecords / 2;
    File partitionDir = InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, numberOfRecords,
            commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // update files or generate new log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime,
            numberOfLogRecords);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);
    InputFormatTestUtil.deltaCommit(basePath, newCommitTime);

    //create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), Arrays.asList(logFilePath), newCommitTime);

    //create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();

    String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");

    // validate record reader compaction
    HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);

    // use reader to read base Parquet File and log file, merge in flight and return latest commit
    // here the first 50 records should be updated, see above
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsRead = 0;
    while (recordReader.next(key, value)) {
        int currentRecordNo = numRecordsRead;
        ++numRecordsRead;
        Writable[] values = value.get();
        String recordCommitTime;
        //check if the record written is with latest commit, here "101"
        if (numRecordsRead > numberOfLogRecords) {
            recordCommitTime = commitTime;
        } else {
            recordCommitTime = newCommitTime;
        }
        String recordCommitTimeSuffix = "@" + recordCommitTime;

        Assert.assertEquals(values[0].toString(), recordCommitTime);
        key = recordReader.createKey();
        value = recordReader.createValue();

        // Assert type STRING
        Assert.assertEquals("test value for field: field1", values[5].toString(), "field" + currentRecordNo);
        Assert.assertEquals("test value for field: field2", values[6].toString(),
                "field" + currentRecordNo + recordCommitTimeSuffix);
        Assert.assertEquals("test value for field: name", values[7].toString(), "name" + currentRecordNo);

        // Assert type INT
        IntWritable intWritable = (IntWritable) values[8];
        Assert.assertEquals("test value for field: favoriteIntNumber", intWritable.get(),
                currentRecordNo + recordCommitTime.hashCode());

        // Assert type LONG
        LongWritable longWritable = (LongWritable) values[9];
        Assert.assertEquals("test value for field: favoriteNumber", longWritable.get(),
                currentRecordNo + recordCommitTime.hashCode());

        // Assert type FLOAT
        FloatWritable floatWritable = (FloatWritable) values[10];
        Assert.assertEquals("test value for field: favoriteFloatNumber", floatWritable.get(),
                (float) ((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0);

        // Assert type DOUBLE
        DoubleWritable doubleWritable = (DoubleWritable) values[11];
        Assert.assertEquals("test value for field: favoriteDoubleNumber", doubleWritable.get(),
                (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0);

        // Assert type MAP
        ArrayWritable mapItem = (ArrayWritable) values[12];
        Writable mapItemValue1 = mapItem.get()[0];
        Writable mapItemValue2 = mapItem.get()[1];

        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get()[0].toString(),
                "mapItem1");
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get()[0].toString(),
                "mapItem2");
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue1).get().length, 2);
        Assert.assertEquals("test value for field: tags", ((ArrayWritable) mapItemValue2).get().length, 2);
        Writable mapItemValue1value = ((ArrayWritable) mapItemValue1).get()[1];
        Writable mapItemValue2value = ((ArrayWritable) mapItemValue2).get()[1];
        Assert.assertEquals("test value for field: tags[\"mapItem1\"].item1",
                ((ArrayWritable) mapItemValue1value).get()[0].toString(), "item" + currentRecordNo);
        Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1",
                ((ArrayWritable) mapItemValue2value).get()[0].toString(), "item2" + currentRecordNo);
        Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2",
                ((ArrayWritable) mapItemValue1value).get()[1].toString(),
                "item" + currentRecordNo + recordCommitTimeSuffix);
        Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2",
                ((ArrayWritable) mapItemValue2value).get()[1].toString(),
                "item2" + currentRecordNo + recordCommitTimeSuffix);

        // Assert type RECORD
        ArrayWritable recordItem = (ArrayWritable) values[13];
        Writable[] nestedRecord = recordItem.get();
        Assert.assertEquals("test value for field: testNestedRecord.isAdmin",
                ((BooleanWritable) nestedRecord[0]).get(), false);
        Assert.assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(),
                "UserId" + currentRecordNo + recordCommitTimeSuffix);

        // Assert type ARRAY
        ArrayWritable arrayValue = (ArrayWritable) values[14];
        Writable[] arrayValues = arrayValue.get();
        for (int i = 0; i < arrayValues.length; i++) {
            Assert.assertEquals("test value for field: stringArray", "stringArray" + i + recordCommitTimeSuffix,
                    arrayValues[i].toString());
        }
    }
}

From source file:com.uber.hoodie.hadoop.realtime.RealtimeCompactedRecordReader.java

License:Apache License

@Override
public boolean next(NullWritable aVoid, ArrayWritable arrayWritable) throws IOException {
    // Call the underlying parquetReader.next - which may replace the passed in ArrayWritable
    // with a new block of values
    boolean result = this.parquetReader.next(aVoid, arrayWritable);
    if (!result) {
        // if the result is false, then there are no more records
        return false;
    } else {
        // TODO(VC): Right now, we assume all records in log, have a matching base record. (which
        // would be true until we have a way to index logs too)
        // return from delta records map if we have some match.
        String key = arrayWritable.get()[HoodieRealtimeInputFormat.HOODIE_RECORD_KEY_COL_POS].toString();
        if (deltaRecordMap.containsKey(key)) {
            // TODO(NA): Invoke preCombine here by converting arrayWritable to Avro. This is required since the
            // deltaRecord may not be a full record and needs values of columns from the parquet
            Optional<GenericRecord> rec;
            if (usesCustomPayload) {
                rec = deltaRecordMap.get(key).getData().getInsertValue(getWriterSchema());
            } else {
                rec = deltaRecordMap.get(key).getData().getInsertValue(getReaderSchema());
            }
            if (!rec.isPresent()) {
                // If the record is not present, this is a delete record using an empty payload so skip this base record
                // and move to the next record
                return next(aVoid, arrayWritable);
            }
            GenericRecord recordToReturn = rec.get();
            if (usesCustomPayload) {
                // If using a custom payload, return only the projection fields
                recordToReturn = HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(rec.get(),
                        getReaderSchema());
            }
            // we assume, a later safe record in the log, is newer than what we have in the map &
            // replace it.
            ArrayWritable aWritable = (ArrayWritable) avroToArrayWritable(recordToReturn, getWriterSchema());
            Writable[] replaceValue = aWritable.get();
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("key %s, base values: %s, log values: %s", key,
                        arrayWritableToString(arrayWritable), arrayWritableToString(aWritable)));
            }
            Writable[] originalValue = arrayWritable.get();
            try {
                System.arraycopy(replaceValue, 0, originalValue, 0, originalValue.length);
                arrayWritable.set(originalValue);
            } catch (RuntimeException re) {
                LOG.error("Got exception when doing array copy", re);
                LOG.error("Base record :" + arrayWritableToString(arrayWritable));
                LOG.error("Log record :" + arrayWritableToString(aWritable));
                throw re;
            }
        }
        return true;
    }
}

From source file:com.uber.hoodie.hadoop.SafeParquetRecordReaderWrapper.java

License:Apache License

public SafeParquetRecordReaderWrapper(RecordReader<NullWritable, ArrayWritable> parquetReader) {
    this.parquetReader = parquetReader;
    ArrayWritable arrayWritable = parquetReader.createValue();
    this.valueClass = arrayWritable.getValueClass();
    this.numValueFields = arrayWritable.get().length;
}
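
The wrapper above only records the value class and the field count from a prototype value obtained via parquetReader.createValue(). A createValue() that hands out a fresh buffer of the same shape could look like the sketch below; this is an assumption about the rest of the wrapper, not necessarily the actual Hudi implementation.

// Hypothetical sketch: allocate a new ArrayWritable with the same value class
// and the same number of slots as the reader's prototype value.
public ArrayWritable createValue() {
    return new ArrayWritable(valueClass, new Writable[numValueFields]);
}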

From source file:edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java

License:Apache License

/**
 * Cleans raw volume content
 * 
 * @param volume
 *            volume content represented as a list of pages
 */
public static void cleanVolume(ArrayWritable volume) {
    Text[] pages = (Text[]) volume.get();

    for (int i = 0; i < pages.length; i++) {
        // TODO: apply clean logic to each page

        String pageContent = pages[i].toString();

        /**
         * Logic to clean the pageContent
         */

        // Logic goes here

        /**
         * set the cleaned content back, pageContent is page content after
         * cleaning
         */
        pages[i].set(pageContent);
    }

    // set cleaned volume content back
    volume.set(pages);
}
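
A hedged usage sketch for cleanVolume: because the method casts volume.get() back to Text[], the array passed to the ArrayWritable constructor must itself be a Text[]. The page strings are invented for illustration.

// Hypothetical caller sketch.
Text[] pages = new Text[] { new Text("raw page 1"), new Text("raw page 2") };
ArrayWritable volume = new ArrayWritable(Text.class, pages);
CorpusProcessingUtils.cleanVolume(volume);
String firstCleanedPage = ((Text) volume.get()[0]).toString();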

From source file:edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java

License:Apache License

/**
 * Extracts interested words (to form full word set)
 *
 * @param volume
 *            volume content represented as a list of pages
 * @return interested words appearing in the specified volume
 */
public static Set<String> getWordSet(ArrayWritable volume) {
    Set<String> wordSet = new HashSet<String>();

    Text[] pages = (Text[]) volume.get();

    for (int i = 0; i < pages.length; i++) {
        // TODO: apply logic to extract interested words from each page

        String pageContent = pages[i].toString();

        /**
         * Logic to extract the interested words
         */

        // Logic goes here

        /**
         * set extracted words to 'wordSet', something like
         * wordSet.addAll(Set<String> extractedWords) where 'extractedWords'
         * contains interested words extracted from 'pageContent'. Word
         * should be converted to lowercase
         */

    }

    return wordSet;
}

From source file:edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java

License:Apache License

/**
 * Transforms a cleaned volume from raw text to indices of words in the full
 * word set.
 * 
 * @param volume
 *            volume content represented as a list of pages
 * @param wordSet
 *            set of interested words (full word set)
 */
public static void transformVolume(ArrayWritable volume, List<String> wordSet) {

    Text[] pages = (Text[]) volume.get();

    for (int i = 0; i < pages.length; i++) {

        /**
         * Need change the RegExp when we are also interested in general
         * symbols other than words
         */
        String[] tokens = pages[i].toString().toLowerCase().split("\\W+");

        StringBuilder transformedPage = new StringBuilder();

        int idx = -1;

        for (String token : tokens) {
            idx = wordSet.indexOf(token);

            /**
             * omit tokens that don't appear in word set
             */
            if (idx != -1) {
                transformedPage.append(idx + " ");
            }

        }

        // remove ' ' at tail
        pages[i].set(transformedPage.toString().trim());
    }

    // set volume
    volume.set(pages);

}

From source file:edu.indiana.d2i.htrc.corpus.CorpusProcessingUtils.java

License:Apache License

/**
 * Converts volume content to indices in the sub word set.
 *
 * @param volume
 *            volume content represented as indices to full word set
 * @param mappingIndices
 */
public static void fullWordSet2SubWordSet(ArrayWritable volume, List<Integer> mappingIndices) {
    Text[] pages = (Text[]) volume.get();

    for (int i = 0; i < pages.length; i++) {
        String[] indices = pages[i].toString().split(" ");

        StringBuilder pg = new StringBuilder();

        for (int j = 0; j < indices.length; j++) {

            // Indices were written as strings; parse them so indexOf matches against the Integer list.
            int idx = indices[j].isEmpty() ? -1 : mappingIndices.indexOf(Integer.parseInt(indices[j]));

            /**
             * omit words not in the sub set
             */
            if (idx != -1) {
                pg.append(idx + " ");
            }
        }

        // remove ' ' at tail
        pages[i].set(pg.toString().trim());
    }

    // set volume
    volume.set(pages);
}

From source file:edu.ub.ahstfg.io.WritableConverter.java

License:Open Source License

/**
 * Converts an ArrayWritable of IntWritable values to a LinkedList of Shorts.
 * @param input ArrayWritable of IntWritable values to convert.
 * @return Converted LinkedList of Short values.
 */
public static LinkedList<Short> arrayWritable2LinkedListShort(ArrayWritable input) {
    LinkedList<Short> ret = new LinkedList<Short>();
    Writable[] ws = input.get();
    IntWritable l;
    for (Writable w : ws) {
        l = (IntWritable) w;
        ret.add((short) l.get());
    }
    return ret;
}
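
A hedged usage sketch for the converter above: each element is downcast to IntWritable, so the input ArrayWritable must hold IntWritable values. The numbers are invented for illustration.

// Hypothetical caller sketch.
ArrayWritable input = new ArrayWritable(IntWritable.class,
        new Writable[] { new IntWritable(7), new IntWritable(42) });
LinkedList<Short> shorts = WritableConverter.arrayWritable2LinkedListShort(input);
// shorts now contains [7, 42] as Short values.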