Example usage for org.apache.hadoop.io Text getBytes


Introduction

This page collects usage examples for org.apache.hadoop.io.Text.getBytes().

Prototype

@Override
public byte[] getBytes() 


Document

Returns the raw bytes; however, only data up to getLength() is valid.
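
Because getBytes() exposes the internal buffer, which may be longer than the encoded data, callers should pair it with getLength(). A minimal sketch of the safe pattern (not taken from the examples below; the class name is illustrative):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.hadoop.io.Text;

public class GetBytesContract {
    public static void main(String[] args) {
        Text t = new Text("hadoop");
        t.set(new Text("pig")); // reuse: getLength() becomes 3, but the backing array may keep 6 bytes
        byte[] valid = Arrays.copyOf(t.getBytes(), t.getLength()); // exactly the valid bytes
        System.out.println(new String(valid, StandardCharsets.UTF_8)); // prints "pig"
    }
}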

Usage

From source file:cosmos.mapred.MediawikiMapper.java

License:Apache License

/**
 * Called once for each key/value pair in the input split. Most applications should override this, but the default is the identity function.
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    Object o;
    try {
        o = unmarshaller.unmarshal(new ByteArrayInputStream(value.getBytes(), 0, value.getLength()));
    } catch (JAXBException e) {
        throw new IOException("Couldn't unmarshall '" + value + "'", e);
    }

    PageType pageType = (PageType) o;

    Page page = pageTypeToPage(pageType);

    Value protobufValue = new Value(page.toByteArray());

    Mutation m = new Mutation(Long.toString(page.getId()));
    m.put(empty, empty, protobufValue);

    context.write(tableName, m);
}

From source file:crunch.MaxTemperature.java

License:Apache License

public static void main(String[] args) {
    Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00");

    ByteBuffer buf = ByteBuffer.wrap(t.getBytes(), 0, t.getLength());
    int cp;
    while (buf.hasRemaining() && (cp = Text.bytesToCodePoint(buf)) != -1) {
        System.out.println(Integer.toHexString(cp));
    }
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void test() throws IOException {
    Text t = new Text("hadoop");
    assertThat(t.getLength(), is(6));
    assertThat(t.getBytes().length, is(6));

    assertThat(t.charAt(2), is((int) 'd'));
    assertThat("Out of bounds", t.charAt(100), is(-1));
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void mutability() throws IOException {
    Text t = new Text("hadoop");
    t.set("pig");
    assertThat(t.getLength(), is(3));
    assertThat(t.getBytes().length, is(3));
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void byteArrayNotShortened() throws IOException {
    Text t = new Text("hadoop");
    t.set(new Text("pig"));
    assertThat(t.getLength(), is(3));
    assertThat("Byte length not shortened", t.getBytes().length, is(6));
}
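
When a copy trimmed to the valid length is needed, Text.copyBytes() (also used in the ThriftFileCollectionRecordReader example below) returns a fresh array containing only the first getLength() bytes. A brief sketch:

Text t = new Text("hadoop");
t.set(new Text("pig"));
byte[] exact = t.copyBytes(); // length 3: a copy trimmed to getLength()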

From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java

License:Apache License

/** 
 * Reads the file paths listed in the input split, one path per line.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 *
 * @throws IOException 
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;

    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;

    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {

        if (pos == 0) {
            // Strip BOM(Byte Order Mark)
            // Text only support UTF-8, we only need to check UTF-8 BOM
            // (0xEF,0xBB,0xBF) at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // find UTF-8 BOM, strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and 
                    // not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        if (newSize == 0) {
            // end of stream
            break;
        }
        paths.add(nextLine.toString());
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step2DistinctDataJobTest.java

License:Apache License

@Test
public void testSplit() throws Exception {
    Text key = new Text("123_456789");

    // hard-split using array copy
    int i = key.find("_", 0);

    Text outputKey = new Text("");
    byte[] bytes = key.getBytes();
    // use getLength(): the backing array returned by getBytes() may be longer than the valid data
    outputKey.append(bytes, i + 1, key.getLength() - i - 1);

    String fileName = new String(bytes, 0, i, StandardCharsets.UTF_8);

    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}

From source file:diamondmapreduce.NLineRecordReader.java

License:Apache License

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (key == null) {
        key = new LongWritable();
    }
    key.set(pos);
    if (value == null) {
        value = new Text();
    }
    value.clear();
    final Text endline = new Text("\n");
    int newSize = 0;
    for (int i = 0; i < NLINESTOPROCESS; i++) {
        Text v = new Text();
        while (pos < end) {
            newSize = in.readLine(v, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
            value.append(v.getBytes(), 0, v.getLength());
            value.append(endline.getBytes(), 0, endline.getLength());
            if (newSize == 0) {
                break;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                break;
            }
        }
    }
    if (newSize == 0) {
        key = null;
        value = null;
        return false;
    } else {
        return true;
    }
}

From source file:eastcircle.terasort.TotalOrderPartitioner.java

License:Apache License

/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, Text prefix, int maxDepth) {
    int depth = prefix.getLength();
    if (depth >= maxDepth || lower == upper) {
        return new LeafTrieNode(depth, splits, lower, upper);
    }
    InnerTrieNode result = new InnerTrieNode(depth);
    Text trial = new Text(prefix);
    // append an extra byte on to the prefix
    trial.append(new byte[1], 0, 1);
    int currentBound = lower;
    for (int ch = 0; ch < 255; ++ch) {
        trial.getBytes()[depth] = (byte) (ch + 1);
        lower = currentBound;
        while (currentBound < upper) {
            if (splits[currentBound].compareTo(trial) >= 0) {
                break;
            }
            currentBound += 1;
        }
        trial.getBytes()[depth] = (byte) ch;
        result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
    }
    // pick up the rest
    trial.getBytes()[depth] = (byte) 255;
    result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
    return result;
}
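
Note that buildTrie writes directly into the Text's backing array via trial.getBytes()[depth]; this is safe only because the preceding append(new byte[1], 0, 1) extended the valid length so that index depth exists.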

From source file:edu.american.student.redis.hadoop.RedisBigTableRecordWriter.java

License:Apache License

@Override
public void write(RedisBigTableKey key, Text value) throws IOException, InterruptedException {
    try {
        // getBytes() may return a backing array longer than the valid data,
        // so copy only the first getLength() bytes before handing them off
        foreman.write(table, key, Arrays.copyOf(value.getBytes(), value.getLength()));
    } catch (RedisForemanException e) {
        throw new IOException(MessageFactory.objective("Write key/value").objects(key, value).toString(), e);
    }
}