Example usage for org.apache.hadoop.io Text copyBytes

Introduction

On this page you can find example usage for org.apache.hadoop.io Text copyBytes.

Prototype

public byte[] copyBytes() 

Document

Get a copy of the bytes that is exactly the length of the data.
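The key difference from getBytes() is that getBytes() returns the Text object's backing buffer, which can be longer than getLength() once the object has been reused, while copyBytes() allocates a fresh array of exactly getLength() bytes. A minimal, self-contained sketch (illustrative only; not taken from the sources below):

import org.apache.hadoop.io.Text;

public class CopyBytesDemo {
    public static void main(String[] args) {
        Text text = new Text("hello world");
        text.set(new Text("hi")); // reusing the object keeps the larger backing buffer

        byte[] backing = text.getBytes(); // backing buffer, may be padded past getLength()
        byte[] exact = text.copyBytes();  // fresh array of exactly getLength() bytes

        System.out.println(text.getLength()); // 2
        System.out.println(backing.length);   // usually larger than 2 after reuse
        System.out.println(exact.length);     // 2
    }
}

This is why the examples below call copyBytes() whenever the bytes are handed to an API that holds on to the whole array, such as new Value(...) or new Put(...).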

Usage

From source file:com.facebook.presto.accumulo.io.AccumuloPageSink.java

License:Apache License

/**
 * Converts a {@link Row} to an Accumulo mutation.
 *
 * @param row Row object
 * @param rowIdOrdinal Ordinal in the list of columns that is the row ID. This isn't checked at all, so I hope you're right. Also, it is expected that the list of column handles is sorted in ordinal order. This is a very demanding function.
 * @param columns All column handles for the Row, sorted by ordinal.
 * @param serializer Instance of {@link AccumuloRowSerializer} used to encode the values of the row to the Mutation
 * @return Mutation
 */
public static Mutation toMutation(Row row, int rowIdOrdinal, List<AccumuloColumnHandle> columns,
        AccumuloRowSerializer serializer) {
    // Set our value to the row ID
    Text value = new Text();
    Field rowField = row.getField(rowIdOrdinal);
    if (rowField.isNull()) {
        throw new PrestoException(INVALID_FUNCTION_ARGUMENT,
                "Column mapped as the Accumulo row ID cannot be null");
    }

    setText(rowField, value, serializer);

    // Iterate through all the column handles, setting the Mutation's columns
    Mutation mutation = new Mutation(value);

    // Store row ID in a special column
    mutation.put(ROW_ID_COLUMN, ROW_ID_COLUMN, new Value(value.copyBytes()));
    for (AccumuloColumnHandle columnHandle : columns) {
        // Skip the row ID ordinal
        if (columnHandle.getOrdinal() == rowIdOrdinal) {
            continue;
        }

        // If the value of the field is not null
        if (!row.getField(columnHandle.getOrdinal()).isNull()) {
            // Serialize the value to the text
            setText(row.getField(columnHandle.getOrdinal()), value, serializer);

            // And add the bytes to the Mutation
            mutation.put(columnHandle.getFamily().get(), columnHandle.getQualifier().get(),
                    new Value(value.copyBytes()));
        }
    }

    return mutation;
}

From source file:com.facebook.presto.accumulo.serializers.StringRowSerializer.java

License:Apache License

@Override
public byte[] encode(Type type, Object value) {
    Text text = new Text();
    if (Types.isArrayType(type)) {
        throw new PrestoException(NOT_SUPPORTED, "arrays are not (yet?) supported for StringRowSerializer");
    } else if (Types.isMapType(type)) {
        throw new PrestoException(NOT_SUPPORTED, "maps are not (yet?) supported for StringRowSerializer");
    } else if (type.equals(BIGINT) && value instanceof Integer) {
        setLong(text, ((Integer) value).longValue());
    } else if (type.equals(BIGINT) && value instanceof Long) {
        setLong(text, (Long) value);
    } else if (type.equals(BOOLEAN)) {
        setBoolean(text, value.equals(Boolean.TRUE));
    } else if (type.equals(DATE)) {
        setDate(text, (Date) value);
    } else if (type.equals(DOUBLE)) {
        setDouble(text, (Double) value);
    } else if (type.equals(INTEGER) && value instanceof Integer) {
        setInt(text, (Integer) value);
    } else if (type.equals(INTEGER) && value instanceof Long) {
        setInt(text, ((Long) value).intValue());
    } else if (type.equals(REAL)) {
        setFloat(text, (Float) value);
    } else if (type.equals(SMALLINT)) {
        setShort(text, (Short) value);
    } else if (type.equals(TIME)) {
        setTime(text, (Time) value);
    } else if (type.equals(TIMESTAMP)) {
        setTimestamp(text, (Timestamp) value);
    } else if (type.equals(TINYINT)) {
        setByte(text, (Byte) value);
    } else if (type.equals(VARBINARY) && value instanceof byte[]) {
        setVarbinary(text, (byte[]) value);
    } else if (type.equals(VARBINARY) && value instanceof Slice) {
        setVarbinary(text, ((Slice) value).getBytes());
    } else if (type.equals(VARCHAR) && value instanceof String) {
        setVarchar(text, ((String) value));
    } else if (type.equals(VARCHAR) && value instanceof Slice) {
        setVarchar(text, ((Slice) value).toStringUtf8());
    } else {
        throw new PrestoException(NOT_SUPPORTED,
                format("StringLexicoder does not support encoding type %s, object class is %s", type,
                        value.getClass()));
    }

    return text.copyBytes();
}
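A brief usage sketch (illustrative; it assumes StringRowSerializer's public no-arg constructor and is not taken from this source): encoding a VARCHAR stores the string's UTF-8 bytes in the Text object, and copyBytes() returns them with no padding.

import static com.facebook.presto.spi.type.VarcharType.VARCHAR;

AccumuloRowSerializer serializer = new StringRowSerializer();
byte[] encoded = serializer.encode(VARCHAR, "presto");
// encoded.length == 6; copyBytes() trims the Text buffer to the data length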

From source file:com.facebook.presto.accumulo.tools.RewriteIndex.java

License:Apache License

private void addIndexEntries(Connector connector, AccumuloTable table, long start) {
    LOG.info(format("Scanning data table %s to add index entries", table.getFullTableName()));
    BatchScanner scanner = null;
    BatchWriter indexWriter = null;
    try {
        // Create index writer and metrics writer, but we are never going to flush the metrics writer
        indexWriter = connector.createBatchWriter(table.getIndexTableName(), bwc);
        Indexer indexer = new Indexer(connector, table, indexWriter,
                table.getMetricsStorageInstance(connector).newWriter(table));
        LOG.info("Created indexer against " + table.getIndexTableName());

        scanner = connector.createBatchScanner(table.getFullTableName(), auths, 10);
        LOG.info(format("Created batch scanner against %s with auths %s", table.getFullTableName(), auths));

        IteratorSetting timestampFilter = new IteratorSetting(21, "timestamp", TimestampFilter.class);
        TimestampFilter.setRange(timestampFilter, 0L, start);
        scanner.addScanIterator(timestampFilter);

        scanner.setRanges(connector.tableOperations().splitRangeByTablets(table.getFullTableName(), new Range(),
                Integer.MAX_VALUE));

        long numRows = 0L;
        long numIndexEntries = 0L;
        Text prevRow = null;
        Text row = new Text();
        Text cf = new Text();
        Text cq = new Text();
        Mutation mutation = null;
        for (Entry<Key, Value> entry : scanner) {
            entry.getKey().getRow(row);
            entry.getKey().getColumnFamily(cf);
            entry.getKey().getColumnQualifier(cq);

            // if the rows do not match, index the mutation
            if (prevRow != null && !prevRow.equals(row)) {
                if (!dryRun) {
                    indexer.index(mutation);
                }
                ++numRows;
                mutation = null;

                if (numRows % 500000 == 0) {
                    if (dryRun) {
                        LOG.info(
                                format("In progress, would have re-indexed %s rows containing %s index entries",
                                        numRows, numIndexEntries));
                    } else {
                        LOG.info(format("In progress, re-indexed %s rows containing %s index entries", numRows,
                                numIndexEntries));
                    }
                }
            }

            if (mutation == null) {
                mutation = new Mutation(row);
            }

            mutation.put(cf, cq, entry.getKey().getColumnVisibilityParsed(), entry.getKey().getTimestamp(),
                    entry.getValue());
            if (table.getColumns().stream()
                    .filter(column -> column.isIndexed() && column.getFamily().isPresent()
                            && column.getQualifier().isPresent()
                            && column.getFamily().get().equals(new String(cf.copyBytes(), UTF_8))
                            && column.getQualifier().get().equals(new String(cq.copyBytes(), UTF_8)))
                    .count() > 0) {
                ++numIndexEntries;
            }

            if (prevRow == null) {
                prevRow = new Text(row);
            } else {
                prevRow.set(row);
            }
        }

        // Index the final mutation
        if (mutation != null) {
            if (!dryRun) {
                indexer.index(mutation);
            }
            ++numRows;
        }

        if (dryRun) {
            LOG.info(format(
                    "Finished dry run of rewriting index entries. Would have re-indexed %s rows containing %s index entries",
                    numRows, numIndexEntries));
        } else {
            LOG.info(format("Finished adding index entries. Re-indexed %s rows containing %s index entries",
                    numRows, numIndexEntries));
        }
    } catch (AccumuloException | AccumuloSecurityException e) {
        LOG.error("Accumulo exception", e);
    } catch (TableNotFoundException e) {
        LOG.error("Table not found, must have been deleted during process", e);
    } finally {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (MutationsRejectedException e) {
                LOG.error("Server rejected mutations", e);
            }
        }

        if (scanner != null) {
            scanner.close();
        }
    }
}

From source file:com.facebook.presto.accumulo.tools.RewriteMetricsTask.java

License:Apache License

private void incrementTimestampMetric(Map<Text, Map<Text, Map<ColumnVisibility, AtomicLong>>> rowMap,
        Text family, ColumnVisibility visibility, Text timestampValue) {
    for (Entry<TimestampPrecision, Long> entry : getTruncatedTimestamps(
            serializer.decode(TIMESTAMP, timestampValue.copyBytes())).entrySet()) {
        Text timestampFamily = new Text(
                Bytes.concat(family.copyBytes(), TIMESTAMP_CARDINALITY_FAMILIES.get(entry.getKey())));

        Text row = new Text(serializer.encode(TIMESTAMP, entry.getValue()));
        Map<Text, Map<ColumnVisibility, AtomicLong>> familyMap = rowMap.get(row);
        if (familyMap == null) {
            familyMap = new HashMap<>();
            rowMap.put(row, familyMap);
        }

        Map<ColumnVisibility, AtomicLong> visibilityMap = familyMap.get(timestampFamily);
        if (visibilityMap == null) {
            visibilityMap = new HashMap<>();
            visibilityMap.put(new ColumnVisibility(), new AtomicLong(0));
            familyMap.put(timestampFamily, visibilityMap);
        }

        if (visibilityMap.containsKey(visibility)) {
            visibilityMap.get(visibility).incrementAndGet();
        } else {
            visibilityMap.put(visibility, new AtomicLong(1));
        }
    }
}

From source file:com.philiphubbard.digraph.MRBuildVerticesTest.java

License:Open Source License

private static void setupTest(Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);

    Path path = new Path(testInput);
    if (fileSystem.exists(path))
        fileSystem.delete(path, true);

    ArrayList<MRVertex> vertices = new ArrayList<MRVertex>();

    MRVertex v0 = new MRVertex(0, conf);
    v0.addEdgeTo(2);
    vertices.add(v0);

    MRVertex v1 = new MRVertex(1, conf);
    v1.addEdgeTo(2);
    vertices.add(v1);

    MRVertex v2 = new MRVertex(2, conf);
    v2.addEdgeTo(3);
    vertices.add(v2);

    MRVertex v3 = new MRVertex(3, conf);
    v3.addEdgeTo(4);
    vertices.add(v3);

    MRVertex v4 = new MRVertex(4, conf);
    v4.addEdgeTo(5);
    v4.addEdgeTo(6);
    vertices.add(v4);

    MRVertex v5 = new MRVertex(5, conf);
    vertices.add(v5);

    MRVertex v6 = new MRVertex(6, conf);
    v6.addEdgeTo(7);
    vertices.add(v6);

    MRVertex v7 = new MRVertex(7, conf);
    vertices.add(v7);

    FSDataOutputStream out = fileSystem.create(path);
    for (MRVertex vertex : vertices) {
        Text text = vertex.toText(MRVertex.EdgeFormat.EDGES_TO);
        byte[] bytes = text.copyBytes();
        for (byte b : bytes)
            out.write(b);
        out.write('\n');
    }
    out.close();

    fileSystem.close();
}

From source file:com.philiphubbard.sabe.MRAssemblerTest1.java

License:Open Source License

private static void setupTest(Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);

    Path path = new Path(testInput);
    if (fileSystem.exists(path))
        fileSystem.delete(path, true);

    ArrayList<Text> reads = new ArrayList<Text>();

    // Goal: AATTCGGCCTTCGGCAT

    reads.add(new Text("AATTCGGC\n"));
    reads.add(new Text("CTTCGGCAT\n"));

    reads.add(new Text("AATT\n"));
    reads.add(new Text("CGGCCTTCGGCAT\n"));

    reads.add(new Text("AATTCGGCCTTCG\n"));
    reads.add(new Text("GCAT\n"));

    FSDataOutputStream out = fileSystem.create(path);
    for (Text read : reads) {
        byte[] bytes = read.copyBytes();
        for (byte b : bytes)
            out.write(b);
    }
    out.close();

    fileSystem.close();
}

From source file:com.philiphubbard.sabe.MRAssemblerTest2.java

License:Open Source License

private static void setupTest(Configuration conf) throws IOException {
    FileSystem fileSystem = FileSystem.get(conf);

    Path path = new Path(testInput);
    if (fileSystem.exists(path))
        fileSystem.delete(path, true);

    ArrayList<Text> reads = new ArrayList<Text>();

    // The expected result:
    // CCCTTTCTGTTGACCCATCATTGTTTAGTAACCCGCGGGATGCCTGGCAGACCCGCGGGACGATCTCCTCTGACCCATCATCGAAATTCC
    // Note that it has the following pattern:
    // segment 0: CCCTTTCTGT 
    // segment 1, which will be repeated: TGACCCATCA 
    // segment 2: TTGTTTAGTA 
    // segment 3, which will be repeated: ACCCGCGGGA 
    // segment 4: TGCCTGGCAG 
    // segment 3, again: ACCCGCGGGA 
    // segment 5: CGATCTCCTC
    // segment 1, again: TGACCCATCA 
    // segment 6: TCGAAATTCC

    reads.add(new Text("CCCTTTC\n"));
    // Error: initial T omitted.
    reads.add(new Text("GTTGACCCATCATTGTTTAGTAACCCGCGGGATGCCTGGCAGACC"));
    reads.add(new Text("CGCGGGACGAT\n"));
    // Error: final C omitted.
    reads.add(new Text("CTCCTCTGACCCATCATCGAAATTC\n"));

    reads.add(new Text("CCCTTTCTGTTGACCCAT\n"));
    // Error: final C replaced with G.
    reads.add(new Text("CATTGTTTAGTAACCCGCGGGATGCCTGGCAGACG\n"));
    reads.add(new Text("CGCGGGACGATCTCCTCTGACCCATCATCGAAATTCC\n"));

    // Error: C at index 14 replaced with A.
    reads.add(new Text("CCCTTTCTGTTGACACATCATTGTTTAGTAAC"));
    reads.add(new Text("CCGCGGGATGCC\n"));
    // Error: C at index 25 omitted.
    reads.add(new Text("TGGCAGACCCGCGGGACGATCTCCTTGACCCATCATCGAAATTCC\n"));

    reads.add(new Text("CCCTTTCTGTTGACCCATCATTGTTTAGTAACCCGCGGGATGCCTG\n"));
    // Error: G at index 10 replaced with T.
    reads.add(new Text("GCAGACCCGCTGGACGA\n"));
    reads.add(new Text("TCTCCTCTGACCCATCATCGAAATTCC\n"));

    reads.add(new Text("CCCTTTCTGTTGACCCATCATTGTTTAGTAACCCGCGGGATGC"));
    // Error: final G omitted.
    reads.add(new Text("CTGGCAGACCCGC\n"));
    reads.add(new Text("GGACGATCTCCTCT\n"));
    // Error: CG at index 10 transposed to GC
    reads.add(new Text("GACCCATCATCGAAATTCC\n"));

    FSDataOutputStream out = fileSystem.create(path);
    for (Text read : reads) {
        byte[] bytes = read.copyBytes();
        for (byte b : bytes)
            out.write(b);
    }
    out.close();

    fileSystem.close();
}

From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java

License:Apache License

/**
 * Reads a list of file paths, one per line.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 *
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;

    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;

    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {

        if (pos == 0) {
            // Strip BOM(Byte Order Mark)
            // Text only support UTF-8, we only need to check UTF-8 BOM
            // (0xEF,0xBB,0xBF) at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // find UTF-8 BOM, strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and 
                    // not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        paths.add(nextLine.toString());
        // comment left over from Hadoop's LineRecordReader; this logs every line read
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}

From source file:dz.lab.mapred.hbase.custom_output.StartsWithCountReducer_HBase.java

@Override
protected void reduce(Text key, Iterable<IntWritable> counts, Context context)
        throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable count : counts) {
        sum += count.get();
    }
    // reducer must output either Put or Delete object
    Put put = new Put(key.copyBytes());
    put.add(toBytes(FAMILY), toBytes(RESULT_COLUMN), toBytes(Integer.toString(sum)));
    context.write(null, put);
}
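For context, a hedged sketch of how such a reducer is typically wired into a job with TableMapReduceUtil, assuming the reducer extends TableReducer (the output table name, job name, and omitted mapper are assumptions, not from this source):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;

public class StartsWithCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        Job job = Job.getInstance(conf, "starts-with-count");
        // Mapper setup omitted; it would emit (Text, IntWritable) pairs.
        TableMapReduceUtil.initTableReducerJob(
                "startsWithCounts",                 // assumed output table
                StartsWithCountReducer_HBase.class, // the reducer above
                job);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}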

From source file:io.fluo.stress.trie.Init.java

License:Apache License

private Collection<Text> writeSplits(FluoConfiguration props, FileSystem fs, Connector conn, Path splitsPath)
        throws Exception {
    Collection<Text> splits1 = conn.tableOperations().listSplits(props.getAccumuloTable());
    OutputStream out = new BufferedOutputStream(fs.create(splitsPath));
    for (Text split : splits1) {
        out.write(Base64.encodeBase64(split.copyBytes()));
        out.write('\n');
    }

    out.close();
    return splits1;
}
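
A matching read-side sketch (illustrative; not from this source), assuming the one-Base64-encoded-split-per-line format written above:

List<Text> splits = new ArrayList<>();
try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(fs.open(splitsPath), StandardCharsets.UTF_8))) {
    String line;
    while ((line = reader.readLine()) != null) {
        splits.add(new Text(Base64.decodeBase64(line)));
    }
}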