Example usage for org.apache.hadoop.io Text getBytes

List of usage examples for org.apache.hadoop.io Text getBytes

Introduction

On this page you can find example usages of org.apache.hadoop.io Text getBytes.

Prototype

@Override
public byte[] getBytes() 

Source Link

Document

Returns the raw backing byte array; however, only the data up to #getLength() is valid, so callers must not rely on the length of the returned array.

Usage

From source file:gaffer.accumulo.utils.IngestUtils.java

License:Apache License

/**
 * Get the existing splits from a table in Accumulo and write a splits file.
 * The number of splits is returned./*from   w w w .  j a v  a 2  s. c o  m*/
 * 
 * @param conn  An existing connection to an Accumulo instance
 * @param table  The table name
 * @param fs  The FileSystem in which to create the splits file
 * @param splitsFile  A path for the splits file
 * @return The number of splits in the table
 * @throws TableNotFoundException
 * @throws IOException
 */
public static int createSplitsFile(Connector conn, String table, FileSystem fs, Path splitsFile)
        throws TableNotFoundException, IOException {
    // Get the splits from the table
    Collection<Text> splits = conn.tableOperations().getSplits(table);

    // Write the splits to file
    if (splits.isEmpty()) {
        return 0;
    }
    PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsFile, true)));
    for (Text split : splits) {
        out.println(new String(Base64.encodeBase64(split.getBytes())));
    }
    out.close();

    return splits.size();
}

From source file:gaffer.accumulo.utils.IngestUtils.java

License:Apache License

/**
 * Given some split points, write a Base64 encoded splits file.
 * /*from   ww w .ja v a  2  s. c  o m*/
 * @param splits  The split points
 * @param fs  The FileSystem in which to create the splits file
 * @param splitsFile  The location of the output splits file
 * @throws IOException
 */
public static void writeSplitsFile(Collection<Text> splits, FileSystem fs, Path splitsFile) throws IOException {
    PrintStream out = null;
    try {
        out = new PrintStream(new BufferedOutputStream(fs.create(splitsFile, true)));
        for (Text split : splits) {
            out.println(new String(Base64.encodeBase64(split.getBytes())));
        }
    } finally {
        IOUtils.closeStream(out);
    }
}

From source file:gaffer.accumulostore.utils.IngestUtils.java

License:Apache License

/**
 * Get the existing splits from a table in Accumulo and write a splits file.
 * Each split is written on its own line, Base64 encoded. The number of splits
 * written is returned.
 * <p>
 * If more than {@code maxSplits} splits are obtained, they are thinned by keeping
 * every n-th split, where n is the number of splits divided by {@code maxSplits}
 * (integer division), until {@code maxSplits} have been kept. The splits file is
 * created even when there are no splits to write.
 *
 * @param conn       - An existing connection to an Accumulo instance
 * @param table      - The table name
 * @param fs         - The FileSystem in which to create the splits file
 * @param splitsFile - A Path for the output splits file
 * @param maxSplits  - The maximum number of splits
 * @return The number of splits written to the file
 * @throws IOException for any IO issues reading from the file system. Other accumulo exceptions are caught and wrapped in an IOException.
 */
public static int createSplitsFile(final Connector conn, final String table, final FileSystem fs,
        final Path splitsFile, final int maxSplits) throws IOException {
    LOGGER.info("Creating splits file in location {} from table {} with maximum splits {}", splitsFile, table,
            maxSplits);
    // Get the splits from the table
    Collection<Text> splits;
    try {
        splits = conn.tableOperations().listSplits(table, maxSplits);
    } catch (TableNotFoundException | AccumuloSecurityException | AccumuloException e) {
        throw new IOException(e.getMessage(), e);
    }
    // This should have returned at most maxSplits splits, but this is not implemented properly in MockInstance.
    if (splits.size() > maxSplits) {
        if (conn instanceof MockConnector) {
            LOGGER.info("Manually reducing the number of splits to {} due to MockInstance not implementing"
                    + " listSplits(table, maxSplits) properly", maxSplits);
        } else {
            LOGGER.info("Manually reducing the number of splits to {} (number of splits was {})", maxSplits,
                    splits.size());
        }
        final Collection<Text> filteredSplits = new TreeSet<>();
        // NOTE(review): this division assumes maxSplits > 0; with maxSplits == 0 and a
        // non-empty table it throws ArithmeticException - confirm callers never pass 0.
        final int outputEveryNth = splits.size() / maxSplits;
        LOGGER.info("Outputting every {}-th split from {} total", outputEveryNth, splits.size());
        int i = 0;
        for (final Text text : splits) {
            if (i % outputEveryNth == 0) {
                filteredSplits.add(text);
            }
            i++;
            if (filteredSplits.size() >= maxSplits) {
                break;
            }
        }
        splits = filteredSplits;
    }
    LOGGER.info("Found {} splits from table {}", splits.size(), table);

    // try-with-resources closes the stream on every path, so no explicit close is needed.
    try (final PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsFile, true)), false,
            CommonConstants.UTF_8)) {
        // Write the splits to file
        for (final Text split : splits) {
            // Text.getBytes() exposes the backing array, which may be longer than the
            // value; only the first getLength() bytes belong to the split.
            final byte[] validBytes = java.util.Arrays.copyOf(split.getBytes(), split.getLength());
            out.println(new String(Base64.encodeBase64(validBytes), CommonConstants.UTF_8));
        }
    }
    return splits.size();
}

From source file:gaffer.accumulostore.utils.IngestUtils.java

License:Apache License

/**
 * Given some split points, write a Base64 encoded splits file
 * <p>/*from  ww  w.  j  a  v  a  2s. c o  m*/
 *
 * @param splits     - A Collection of splits
 * @param fs         - The FileSystem in which to create the splits file
 * @param splitsFile - A Path for the output splits file
 * @throws IOException for any IO issues writing to the file system.
 */
public static void writeSplitsFile(final Collection<Text> splits, final FileSystem fs, final Path splitsFile)
        throws IOException {
    try (final PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsFile, true)), false,
            CommonConstants.UTF_8)) {
        for (final Text split : splits) {
            out.println(new String(Base64.encodeBase64(split.getBytes()), CommonConstants.UTF_8));
        }
    }
}

From source file:gov.jgi.meta.hadoop.input.FastaBlockLineReader.java

License:Open Source License

/**
 * Reads a block of FASTA records from the underlying stream and stores every
 * record found in {@code set}.
 * <p>
 * Bytes are skipped until the first '>' is seen. From there the input is
 * accumulated until a '>' is encountered after more than
 * {@code maxBytesToConsume} bytes have been consumed, or until the stream ends.
 * The accumulated block is then split into records: the key is the first
 * whitespace-delimited word of the header line and the value is the
 * concatenation of the following lines, lower-cased.
 *
 * @param key               not used by this implementation
 * @param set               receives one entry per record (header id to sequence)
 * @param maxLineLength     not used by this implementation
 * @param maxBytesToConsume byte count after which reading stops at the next record start
 * @return the number of bytes consumed from the stream by this call
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text key, Map<String, String> set, int maxLineLength, long maxBytesToConsume)
        throws IOException {

    int totalBytesRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            // Restart the accounting at the head of the refilled buffer; keeping the
            // old startPosn would miscount the bytes skipped after a refill.
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '>');

    /*
    if we hit the end of file before any record start, report only the skipped bytes
     */
    if (eof) {
        return totalBytesRead;
    }

    /*
    now bufferPosn is one past the ">" that starts a fasta record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a ">"

    /*
    find the record start at which this block ends
     */
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }

    } while (buffer[bufferPosn++] != '>' || (totalBytesRead + bufferPosn - startPosn) <= maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    Text k = new Text();
    Text s = new Text();
    int i;
    int j = 1; // skip initial record separator ">"
    do {
        k.clear();
        s.clear();

        /*
        first parse the key: scan up to, but not past, the first whitespace or
        line break. Leaving j on the delimiter keeps the length (j - i) correct
        even when the block ends without a delimiter.
         */
        i = j;
        boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j);
            if (c == CR || c == LF) {
                break;
            }
            if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
            j++;
        }
        k.append(recordBlock.getBytes(), i, j - i);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF) {
                j++;
            }
        }

        /*
        now skip the newlines
        */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) {
            j++;
        }

        /*
        now read the sequence, one line at a time, until the next ">" or the end of the block
        */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j);
                if (c == CR || c == LF) {
                    break;
                }
                j++;
            }
            s.append(recordBlock.getBytes(), i, j - i);

            while (j < recordBlock.getLength()
                    && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) {
                j++;
            }

        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '>');

        // Store the record once it is complete; Locale.ROOT keeps the lower-casing
        // independent of the JVM's default locale.
        set.put(k.toString(), s.toString().toLowerCase(java.util.Locale.ROOT));

        // The loop above stops either at the end of the block or on the next ">".
        j++; // skip the ">"

    } while (j < recordBlock.getLength());

    return totalBytesRead;
}

From source file:gov.jgi.meta.hadoop.input.FastqBlockLineReader.java

License:Open Source License

/**
 * Reads a block of FASTQ records from the underlying stream and stores every
 * record found in {@code set}.
 * <p>
 * Bytes are skipped until the first '@' is seen. From there the input is
 * accumulated until an '@' is encountered once at least
 * {@code maxBytesToConsume} bytes have been consumed, or until the stream ends.
 * The accumulated block is then split into records: the key is the first
 * whitespace-delimited word of the header line and the value is the
 * concatenation of the lines that precede the '+' line, lower-cased.
 *
 * @param key               not used by this implementation
 * @param set               receives one entry per record (header id to sequence)
 * @param maxLineLength     not used by this implementation
 * @param maxBytesToConsume byte count after which reading stops at the next '@'
 * @return the number of bytes consumed from the stream by this call
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text key, Map<String, String> set, int maxLineLength, int maxBytesToConsume)
        throws IOException {

    int totalBytesRead = 0;
    boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            // Restart the accounting at the head of the refilled buffer; keeping the
            // old startPosn would miscount the bytes skipped after a refill.
            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /*
    if we hit the end of file before any record start, report only the skipped bytes
     */
    if (eof) {
        return totalBytesRead;
    }

    /*
    now bufferPosn is one past the "@" that starts a fastq record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /*
    find the record start at which this block ends
    NOTE(review): any "@" byte ends the block here, including one inside a
    quality line - confirm the input cannot contain that.
     */
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }

    } while (buffer[bufferPosn++] != '@' || (totalBytesRead + bufferPosn - startPosn) < maxBytesToConsume);

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    Text k = new Text();
    Text s = new Text();
    int i;
    int j = 1; // skip initial record separator "@"
    do {
        k.clear();
        s.clear();

        /*
        first parse the key: scan up to, but not past, the first whitespace or
        line break. Leaving j on the delimiter keeps the length (j - i) correct
        even when the block ends without a delimiter.
         */
        i = j;
        boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j);
            if (c == CR || c == LF) {
                break;
            }
            if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
            j++;
        }
        k.append(recordBlock.getBytes(), i, j - i);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF) {
                j++;
            }
        }

        /*
        now skip the newlines
        */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) {
            j++;
        }

        /*
        now read the sequence, one line at a time, until a line starting with "+"
        or the end of the block
        */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j);
                if (c == CR || c == LF) {
                    break;
                }
                j++;
            }
            s.append(recordBlock.getBytes(), i, j - i);

            while (j < recordBlock.getLength()
                    && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF)) {
                j++;
            }

        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+');

        // Store the record once it is complete; Locale.ROOT keeps the lower-casing
        // independent of the JVM's default locale.
        set.put(k.toString(), s.toString().toLowerCase(java.util.Locale.ROOT));

        /*
        now skip the "+" line and the quality line till the next record start
        NOTE(review): this stops on the first "@", so a quality string containing
        "@" would be taken for a record start - confirm against the expected input.
        */
        while (j < recordBlock.getLength() && recordBlock.charAt(j) != '@') {
            j++;
        }

        j++; // skip the "@"

    } while (j < recordBlock.getLength());

    return totalBytesRead;
}

From source file:gov.jgi.meta.hadoop.input.FastqLineReader.java

License:Open Source License

/**
 * Read one line from the InputStream into the given Text.  A line
 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
 * or '\r\n' (CR+LF).  EOF also terminates an otherwise unterminated
 * line./*ww w .j a  v a  2 s  .  c  o  m*/
 *
 * @param str               the object to store the given line (without newline)
 * @param maxLineLength     the maximum number of bytes to store into str;
 *                          the rest of the line is silently discarded.
 * @param maxBytesToConsume the maximum number of bytes to consume
 *                          in this call.  This is only a hint, because if the line cross
 *                          this threshold, we allow it to happen.  It can overshoot
 *                          potentially by as much as one buffer length.
 * @return the number of bytes read including the (longest) newline
 *         found.
 * @throws java.io.IOException if the underlying stream throws
 */
public int readLine(Text key, Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
    int totalBytesRead = 0;
    int numRecordsRead = 0;
    Boolean eof = false;
    int startPosn;
    Text recordBlock = new Text();

    /*
    first thing to do is to move forward till you see a start character
     */
    startPosn = bufferPosn;
    do {
        if (bufferPosn >= bufferLength) {
            totalBytesRead += bufferPosn - startPosn;
            bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
    } while (buffer[bufferPosn++] != '@');

    /*
    if we hit the end of file already, then just return 0 bytes processed
     */
    if (eof)
        return totalBytesRead;

    /*
    now bufferPosn should be at the start of a fastq record
     */
    totalBytesRead += (bufferPosn - 1) - startPosn;
    startPosn = bufferPosn - 1; // startPosn guaranteed to be at a "@"

    /*
    find the next record start
     */
    eof = false;
    int numOfNewlines = 0;//Added by lanhin
    do {
        if (bufferPosn >= bufferLength) {

            /*
            copy the current buffer before refreshing the buffer
             */
            int appendLength = bufferPosn - startPosn;
            recordBlock.append(buffer, startPosn, appendLength);
            totalBytesRead += appendLength;

            startPosn = bufferPosn = 0;
            bufferLength = in.read(buffer);
            if (bufferLength <= 0) {
                eof = true;
                break; // EOF
            }
        }
        //Modefied by lanhin
        if (buffer[bufferPosn] == CR || buffer[bufferPosn] == LF) {
            numOfNewlines++;
        }
        if ((numOfNewlines >= 4) && buffer[bufferPosn] == '@') {
            bufferPosn++;
            break;
        }
        bufferPosn++;
    } while (true);//buffer[bufferPosn++] != '@');  // only read one record at a time
    //Modefied by lanhin end

    if (!eof) {
        bufferPosn--; // make sure we leave bufferPosn pointing to the next record
        int appendLength = bufferPosn - startPosn;
        recordBlock.append(buffer, startPosn, appendLength);
        totalBytesRead += appendLength;
    }

    /*
    record block now has the byte array we want to process for reads
     */

    int i = 1; // skip initial record seperator "@"
    int j = 1;
    do {
        key.clear();
        str.clear();
        /*
        first parse the key
         */
        i = j;
        Boolean junkOnLine = false;
        while (j < recordBlock.getLength()) {
            int c = recordBlock.charAt(j++);
            if (c == CR || c == LF) {
                break;
            } else if (c == ' ' || c == '\t') {
                junkOnLine = true;
                break;
            }
        }
        key.append(recordBlock.getBytes(), i, j - i - 1);

        /*
        in case there is additional metadata on the header line, ignore everything after
        the first word.
         */
        if (junkOnLine) {
            while (j < recordBlock.getLength() && recordBlock.charAt(j) != CR && recordBlock.charAt(j) != LF)
                j++;
        }

        //LOG.info ("key = " + k.toString());

        /*
        now skip the newlines
        */
        while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
            j++;

        /*
        now read the sequence
        */
        do {
            i = j;
            while (j < recordBlock.getLength()) {
                int c = recordBlock.charAt(j++);
                if (c == CR || c == LF) {
                    break;
                }
            }
            str.append(recordBlock.getBytes(), i, j - i - 1);

            while (j < recordBlock.getLength() && (recordBlock.charAt(j) == CR || recordBlock.charAt(j) == LF))
                j++;

        } while (j < recordBlock.getLength() && recordBlock.charAt(j) != '+');

        numRecordsRead++;

        /*
        now skip characters (newline or carige return most likely) till record start
        */
        while (j < recordBlock.getLength()) {
            // && recordBlock.charAt(j) != '@') {  // Modified by lanhin
            /* Should go straight to the end of recordBlock,
               ignore all the left info.  --lanhin*/

            j++;
        }

        j++; // skip the "@"

    } while (j < recordBlock.getLength());

    return totalBytesRead;
}

From source file:gov.jgi.meta.pig.storage.FastaStorage.java

License:Open Source License

/**
 * Returns the next sequence from the block as a four-field tuple:
 * (sequence key, read direction, sequence bytes, header remainder).
 * <p>
 * The record id is split at the first space or tab: the part before it is the
 * key and the rest (including that whitespace) is the header. A key containing
 * "/" is split further into the part before the first "/" (the key) and the part
 * after it (the direction); otherwise the direction is "0".
 *
 * @return the next tuple, or {@code null} when the input is exhausted
 * @throws IOException if the record reader fails; an ExecException is thrown
 *                     when the read is interrupted
 */
@Override
public Tuple getNext() throws IOException {

    if (mProtoTuple == null) {
        mProtoTuple = new ArrayList<Object>();
    }

    try {
        if (!in.nextKeyValue()) {
            return (null);
        }

        /*
          check the id of the sequence to see if its a paired read
         */
        String seqid = (in.getCurrentKey()).toString();
        String seqkey = seqid;
        String header = "";
        for (int i = 0; i < seqid.length(); i++) {
            char ch = seqid.charAt(i);
            if (ch == ' ' || ch == '\t') {
                seqkey = seqid.substring(0, i);
                header = seqid.substring(i);
                break;
            }
        }

        String seqkey2;
        String direction;
        if (seqkey.indexOf("/") >= 0) {
            // The -1 limit keeps trailing empty fields, so a key that ends in "/"
            // yields an empty direction instead of a too-short array.
            String[] a = seqkey.split("/", -1);
            seqkey2 = a[0];
            direction = a[1];
        } else {
            seqkey2 = seqkey;
            direction = "0";
        }

        // Encode with an explicit charset and size each field by its byte count:
        // String.length() counts chars and under-counts non-ASCII text.
        final java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");
        byte[] keyBytes = seqkey2.getBytes(utf8);
        byte[] directionBytes = direction.getBytes(utf8);
        byte[] headerBytes = header.getBytes(utf8);

        Text value = ((Text) in.getCurrentValue());
        mProtoTuple.add(new DataByteArray(keyBytes, 0, keyBytes.length)); // add key
        mProtoTuple.add(new DataByteArray(directionBytes, 0, directionBytes.length)); // add direction
        // only the first getLength() bytes of the Text backing array are valid
        mProtoTuple.add(new DataByteArray(value.getBytes(), 0, value.getLength())); // add sequence
        mProtoTuple.add(new DataByteArray(headerBytes, 0, headerBytes.length)); // add header

        // The tuple takes ownership of the list (no copy), so a fresh one is
        // created on the next call.
        Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
        mProtoTuple = null;
        return (t);
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }
}

From source file:gov.jgi.meta.sequence.SequenceString.java

License:Open Source License

/**
 * Decodes the valid bytes of {@code seq} into a string by looking each byte up
 * in {@code reverseHash} and appending the result. {@code init()} is called first.
 * Only the first {@code seq.getLength()} bytes of the backing array are read.
 *
 * @param seq the encoded sequence
 * @return the decoded sequence string
 */
public static String byteArrayToSequence(Text seq) {
    init();

    final StringBuilder decoded = new StringBuilder();
    final byte[] encoded = seq.getBytes();
    final int validLength = seq.getLength();

    int idx = 0;
    while (idx < validLength) {
        decoded.append(reverseHash.get(encoded[idx]));
        idx++;
    }

    return decoded.toString();
}

From source file:gov.jgi.meta.sequence.SequenceStringCompress.java

License:Open Source License

/**
 * Third version
 * Different input type: accepts the sequence as a {@code Text}.
 * <p>
 * The length of the array returned by {@code Text.getBytes()} is not always the
 * right length to use: only the first {@code getLength()} bytes are valid. Those
 * bytes are copied into an exactly-sized array before delegating to the
 * {@code byte[]} overload.
 *
 * @param seq the encoded sequence
 * @return the decoded sequence string
 */
public static String byteArrayToSequence(Text seq) {
    byte[] ba = java.util.Arrays.copyOf(seq.getBytes(), seq.getLength());

    return byteArrayToSequence(ba);
}