Example usage for org.apache.hadoop.io Text getBytes

List of usage examples for org.apache.hadoop.io Text getBytes

Introduction

On this page you can find example usages of the getBytes() method of org.apache.hadoop.io.Text.

Prototype

@Override
public byte[] getBytes() 

Source Link

Document

Returns the raw backing byte array; however, only the data up to getLength() is valid, so callers should always pair this call with getLength().

Usage

From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java

License:Apache License

/**
 * Positions the reader at the first record boundary that belongs to this split.
 *
 * <p>Opens the split's file (through a decompressor when a codec matches the path). For a
 * split that does not begin at offset 0, the partial first line is discarded and further
 * lines are consumed until {@code canSplit} accepts the boundary between two consecutive
 * lines. For a split that begins at offset 0, the first line is read and skipped.
 *
 * @param genericSplit the split to read; it is cast to {@code FileSplit}
 * @param context task context supplying the job configuration
 * @throws IOException if the file cannot be opened, sought or read
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            // The codec may move the boundaries to its own block limits.
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    LOG.info("Read from " + split.getPath().toString());
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        // maxLineLength of 0: the bytes are consumed but nothing is stored.
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));

        // Read another line as previous.
        Text current = new Text();

        int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start));

        LOG.info("Skip line " + previous + " for last split.");

        start += newSize;

        // Keep reading until a splitable point is found.
        while (start <= end) {
            newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
            if (newSize == 0) {
                // End of input: no further line can be read, so stop here rather than
                // looping forever with a start offset that can no longer advance.
                break;
            }
            // Text.getBytes() exposes the whole backing array; hand canSplit only the
            // bytes that belong to each line.
            if (canSplit(copyValidBytes(previous), copyValidBytes(current))) {
                break;
            }
            start += newSize;
            // set(Text) copies exactly getLength() bytes, unlike set(byte[]) on the raw array.
            previous.set(current);
            LOG.info("Skip line " + previous + " for last split.");
        }

        // If exceed the end, still read one extra line.
        if (start > end) {
            if (isContinue) {
                newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
                if (!canSplit(copyValidBytes(previous), copyValidBytes(current))) {
                    // Still not splitable. So skip the block.
                    start += newSize;
                    isContinue = false;
                }
            }
        }
        LOG.info("Split between: \n" + previous + "\n" + current);

        // Restart at the last read line.
        // NOTE(review): this seeks and re-wraps the raw fileIn even when the input is
        // compressed, dropping the decompressing stream set up above - confirm that a
        // non-zero start never occurs together with a codec.
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        this.pos = start;
    } else {
        // First split: the first line is read and skipped, and start moves past it.
        Text skip = new Text();
        start += in.readLine(skip, maxLineLength, maxBytesToConsume(start));
        LOG.info("Skip line " + skip + ". Start at " + start);
    }
}

/**
 * Returns a copy of only the valid bytes of {@code text}. {@code Text.getBytes()} returns the
 * whole backing array, and only the first {@code getLength()} bytes of it are valid.
 */
private static byte[] copyValidBytes(Text text) {
    return java.util.Arrays.copyOf(text.getBytes(), text.getLength());
}

From source file:com.cloudera.castagna.logparser.pig.LogLoader.java

License:Apache License

/**
 * Reads the next input record and wraps its bytes in a single-field tuple.
 *
 * @return a tuple holding the valid bytes of the current value, or {@code null} when the
 *         underlying reader reports that no further record is available
 * @throws IOException if reading fails, or if the thread is interrupted while reading (the
 *         interrupt flag is restored and the interruption is attached as the cause)
 */
@Override
public Tuple getNext() throws IOException {
    try {
        boolean notDone = in.nextKeyValue();
        if (!notDone) {
            return null;
        }
        Text value = (Text) in.getCurrentValue();
        byte[] ba = value.getBytes();
        // Only the first getLength() bytes of the backing array are valid, and
        // TextInputFormat will reuse that array, so pass an explicit offset and length.
        return mTupleFactory.newTuple(new DataByteArray(ba, 0, value.getLength()));
    } catch (InterruptedException e) {
        // Restore the interrupt flag for callers further up and keep the original cause.
        Thread.currentThread().interrupt();
        throw new IOException("Error getting input", e);
    }
}

From source file:com.cloudera.impala.hive.executor.TestUdf.java

License:Apache License

/**
 * Returns a copy of the given text.
 *
 * @param a the input value; may be {@code null}
 * @return a new {@code Text} with the same content as {@code a}, or {@code null} if
 *         {@code a} is {@code null}
 */
public Text evaluate(Text a) {
    if (a == null) {
        return null;
    }
    // getBytes() returns the whole backing array, of which only getLength() bytes are
    // valid; the copy constructor copies exactly the valid bytes.
    return new Text(a);
}

From source file:com.cloudera.recordservice.examples.terasort.TeraValidate.java

License:Apache License

/**
 * Renders the valid bytes of {@code t} using {@code BytesWritable}'s string form.
 *
 * @param t the text whose first {@code getLength()} bytes are rendered
 * @return the {@code toString()} of a {@code BytesWritable} holding those bytes
 */
private static String textifyBytes(Text t) {
    final byte[] backing = t.getBytes();
    final int validLength = t.getLength();
    final BytesWritable rendered = new BytesWritable();
    rendered.set(backing, 0, validLength);
    return rendered.toString();
}

From source file:com.dappervision.hbase.mapred.TypedBytesTableReducer.java

License:Apache License

/**
 * Merges every value of one key into a nested family -> qualifier -> value map and emits
 * the key and that map as typed-bytes records.
 *
 * <p>Each value is parsed as {@code family:qualifier=value}. A later value for the same
 * family and qualifier replaces an earlier one. A value without a ':' or without a '=' after
 * it fails with an {@code ArrayIndexOutOfBoundsException}, because the second part of the
 * split is read unconditionally.
 *
 * @param key row key; only its first {@code getLength()} bytes are emitted
 * @param values the {@code family:qualifier=value} strings for this key
 * @param outputCollector receives the single (key, map) pair
 * @param arg3 reporter; not used by this method
 * @throws IOException if the collector fails
 */
@Override
public void reduce(Text key, Iterator<Text> values,
        OutputCollector<TypedBytesWritable, TypedBytesWritable> outputCollector, Reporter arg3)
        throws IOException {
    // Text.getBytes() returns the whole backing array; copy only the valid prefix so that
    // stale trailing bytes never become part of the emitted key.
    byte[] keyBytes = java.util.Arrays.copyOf(key.getBytes(), key.getLength());
    TypedBytesWritable keyWritable = new TypedBytesWritable();
    TypedBytesWritable valueWritable = new TypedBytesWritable();
    keyWritable.setValue(new Buffer(keyBytes));

    // Merge the column family and qualifier.
    HashMap<String, HashMap<String, String>> cfMap = new HashMap<String, HashMap<String, String>>();
    while (values.hasNext()) {
        Text value = values.next();
        String strVal = value.toString();
        // The column family is separated from the rest by a colon (:).
        // The qualifier and the value are separated by an equals sign (=).
        String[] cfQualValParts = strVal.split(":");
        String cf = cfQualValParts[0];
        String qualVal = cfQualValParts[1];
        String[] qualValParts = qualVal.split("=");
        String qual = qualValParts[0];
        String val = qualValParts[1];

        HashMap<String, String> qualMap = cfMap.get(cf);
        if (qualMap == null) {
            qualMap = new HashMap<String, String>();
            cfMap.put(cf, qualMap);
        }
        // A duplicated qualifier is replaced here; with Buffer keys we would have to do
        // the de-duplication ourselves.
        qualMap.put(qual, val);
    }

    HashMap<Buffer, HashMap<Buffer, Buffer>> bufMap = new HashMap<Buffer, HashMap<Buffer, Buffer>>();
    Set<Entry<String, HashMap<String, String>>> entrySet = cfMap.entrySet();
    for (Entry<String, HashMap<String, String>> entry : entrySet) {
        HashMap<String, String> qualValMap = entry.getValue();

        HashMap<Buffer, Buffer> qualValBufMap = new HashMap<Buffer, Buffer>();
        for (Entry<String, String> qualValEntry : qualValMap.entrySet()) {
            // The strings were decoded from Text (UTF-8), so encode them back as UTF-8
            // rather than with the platform default charset.
            qualValBufMap.put(new Buffer(qualValEntry.getKey().getBytes("UTF-8")),
                    new Buffer(qualValEntry.getValue().getBytes("UTF-8")));
        }

        bufMap.put(new Buffer(entry.getKey().getBytes("UTF-8")), qualValBufMap);
    }
    valueWritable.setValue(bufMap);

    outputCollector.collect(keyWritable, valueWritable);
}

From source file:com.datasalt.utils.mapred.joiner.MultiJoinChanneledMapper.java

License:Apache License

/**
 * Emits a datum under the given group and secondary-sort key, forwarding only the valid
 * bytes of {@code grouping} to {@code emitBytes}.
 *
 * @param grouping group key; its backing array is passed with offset 0 and its length
 * @param secondarySort secondary-sort key handed through unchanged
 * @param datum the value to emit
 */
protected void emit(Text grouping, WritableComparable secondarySort, OUTPUT_VALUE datum)
        throws IOException, InterruptedException {
    final byte[] groupBytes = grouping.getBytes();
    final int groupLength = grouping.getLength();
    emitBytes(groupBytes, 0, groupLength, secondarySort, datum);
}

From source file:com.datasalt.utils.mapred.joiner.MultiJoinChanneledMapper.java

License:Apache License

/**
 * Emits a datum under the given group without a secondary-sort key ({@code null} is passed
 * in its place), forwarding only the valid bytes of {@code grouping} to {@code emitBytes}.
 *
 * @param grouping group key; its backing array is passed with offset 0 and its length
 * @param datum the value to emit
 */
protected void emit(Text grouping, OUTPUT_VALUE datum) throws IOException, InterruptedException {
    final byte[] groupBytes = grouping.getBytes();
    final int groupLength = grouping.getLength();
    emitBytes(groupBytes, 0, groupLength, null, datum);
}

From source file:com.datasalt.utils.mapred.joiner.MultiJoinMultiChannelMapper.java

License:Apache License

/**
 * Emits a datum on a channel under the given group and secondary-sort key, forwarding only
 * the valid bytes of {@code grouping} to {@code emitBytes}.
 *
 * @param grouping group key; its backing array is passed with offset 0 and its length
 * @param secondarySort secondary-sort key handed through unchanged
 * @param datum the value to emit
 * @param channel channel number handed through unchanged
 */
protected void emit(Text grouping, WritableComparable secondarySort, Object datum, int channel)
        throws IOException, InterruptedException {
    final byte[] groupBytes = grouping.getBytes();
    final int groupLength = grouping.getLength();
    emitBytes(groupBytes, 0, groupLength, secondarySort, datum, channel);
}

From source file:com.datasalt.utils.mapred.joiner.MultiJoinMultiChannelMapper.java

License:Apache License

/**
 * Emits a datum on a channel under the given group without a secondary-sort key
 * ({@code null} is passed in its place), forwarding only the valid bytes of
 * {@code grouping} to {@code emitBytes}.
 *
 * @param grouping group key; its backing array is passed with offset 0 and its length
 * @param datum the value to emit
 * @param channel channel number handed through unchanged
 */
protected void emit(Text grouping, Object datum, int channel) throws IOException, InterruptedException {
    final byte[] groupBytes = grouping.getBytes();
    final int groupLength = grouping.getLength();
    emitBytes(groupBytes, 0, groupLength, null, datum, channel);
}

From source file:com.ebay.nest.io.sede.binarysortable.BinarySortableSerDe.java

License:Apache License

/**
 * Reads a zero-terminated, escaped byte string from {@code buffer} into {@code r}.
 *
 * <p>A first pass scans to the terminating 0 byte to count the unescaped length, treating a
 * byte of 1 as an escape marker followed by one more byte. If escaping was seen, a second
 * pass re-reads the bytes and writes the unescaped values into the backing array of
 * {@code r}.
 *
 * @param buffer source buffer, read from its current position
 * @param invert flag passed through to every {@code buffer.read} call
 * @param r the {@code Text} to fill; it is also the return value
 * @return {@code r}, holding the decoded bytes
 * @throws IOException declared by the method; presumably raised by the buffer reads
 */
static Text deserializeText(InputByteBuffer buffer, boolean invert, Text r) throws IOException {
    // Get the actual length first
    int start = buffer.tell();
    int length = 0;
    do {
        byte b = buffer.read(invert);
        if (b == 0) {
            // end of string
            break;
        }
        if (b == 1) {
            // the last char is an escape char. read the actual char
            buffer.read(invert);
        }
        length++;
    } while (true);

    // NOTE(review): if tell() advances by one for every read(), the terminator consumed
    // above makes tell() - start at least length + 1, so this fast path looks
    // unreachable. It also copies the raw bytes without applying `invert`. Confirm against
    // InputByteBuffer before changing the condition.
    if (length == buffer.tell() - start) {
        // No escaping happened, so we are already done.
        r.set(buffer.getData(), start, length);
    } else {
        // Escaping happened, we need to copy byte-by-byte.
        // 1. Set the length first.
        //    (The content copied here is overwritten by step 3; the call sizes r.)
        r.set(buffer.getData(), start, length);
        // 2. Reset the pointer.
        buffer.seek(start);
        // 3. Copy the data.
        //    getBytes() returns r's backing array, so the writes below edit r in place.
        byte[] rdata = r.getBytes();
        for (int i = 0; i < length; i++) {
            byte b = buffer.read(invert);
            if (b == 1) {
                // The last char is an escape char, read the actual char.
                // The serialization format escape \0 to \1, and \1 to \2,
                // to make sure the string is null-terminated.
                b = (byte) (buffer.read(invert) - 1);
            }
            rdata[i] = b;
        }
        // 4. Read the null terminator.
        byte b = buffer.read(invert);
        assert (b == 0);
    }
    return r;
}