Example usage for org.apache.hadoop.io Text getLength

List of usage examples for org.apache.hadoop.io Text getLength

Introduction

On this page you can find example usages of the getLength() method of org.apache.hadoop.io.Text.

Prototype

@Override
public int getLength() 

Source Link

Document

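Returns the number of valid bytes in the backing byte array. The array returned by getBytes() may be longer than this; only the first getLength() bytes hold data.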

Usage

From source file:RunText.java

License:Apache License

/**
 * Splits a delimited record into its fields.
 *
 * <p>Field boundaries are located with {@code find(value, delim, p)}, which
 * skips delimiter bytes that appear between double quotes. Only fields that
 * are terminated by a delimiter are returned: once no further delimiter is
 * found the loop stops, so any bytes after the last delimiter are not added.
 *
 * @param value the record to split; only its first {@code getLength()} bytes are read
 * @return the delimiter-terminated fields, in order of appearance
 */
private static List<String> parse(Text value) {
    // The backing array can be longer than the valid data, so bound every read by getLength().
    final byte[] bytes = value.getBytes();
    final int length = value.getLength();

    List<String> strings = Lists.newArrayList();
    int p = 0;
    while (p < length) {
        int next = find(value, delim, p);
        if (next == -1) {
            break;
        }
        // Text stores UTF-8, so decode with an explicit charset instead of the
        // platform default, which would corrupt non-ASCII fields on some hosts.
        strings.add(new String(bytes, p, next - p, java.nio.charset.StandardCharsets.UTF_8));
        p = next + 1;
    }
    return strings;
}

From source file:RunText.java

License:Apache License

/**
 * Finds the next occurrence of a byte that is not inside a double-quoted region.
 *
 * <p>Quote tracking starts in the "not quoted" state at {@code start} and is
 * toggled by every {@code '"'} byte encountered. The quote state is updated
 * before the byte is compared, so a quote byte itself can match {@code what}
 * only when it closes a quoted region.
 *
 * @param text  the text to scan; only its first {@code getLength()} bytes are examined
 * @param what  the byte to look for
 * @param start index of the first byte to examine
 * @return the index of the first unquoted match at or after {@code start}, or -1 if none
 */
private static int find(Text text, byte what, int start) {
    final byte[] data = text.getBytes();
    final int limit = text.getLength();
    boolean quoted = false;
    for (int i = start; i < limit; i++) {
        final byte current = data[i];
        if (current == '\"') {
            quoted = !quoted;
        }
        if (current == what && !quoted) {
            return i;
        }
    }
    return -1;
}

From source file:Importer.java

License:Open Source License

/**
 * Builds a record name of the form {@code post_<md5>} from the given content.
 *
 * <p>The MD5 digest is computed over the first {@code getLength()} bytes of the
 * content's backing array and rendered as lowercase hex, two digits per byte.
 *
 * @param content the text to digest
 * @return a new {@code Text} holding {@code "post_"} followed by the hex digest
 * @throws Exception if the MD5 algorithm is unavailable
 */
public static Text hash(Text content) throws Exception {
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    md5.update(content.getBytes(), 0, content.getLength());

    StringBuilder name = new StringBuilder("post_");
    for (byte b : md5.digest()) {
        int unsigned = b & 0xFF;
        // toHexString drops the leading zero for values below 0x10, so pad it back.
        if (unsigned < 0x10) {
            name.append('0');
        }
        name.append(Integer.toHexString(unsigned));
    }
    return new Text(name.toString());
}

From source file:Importer.java

License:Open Source License

/**
 * Converts one input file into a block-compressed SequenceFile named
 * {@code <file name>.seq} under {@code outDir}.
 *
 * <p>The input is read line by line as US-ASCII. Lines are concatenated (each
 * followed by a single space) into the current document until a line starting
 * with {@code ---END.OF.DOCUMENT---} is seen, at which point the document is
 * appended to the SequenceFile keyed by {@link #hash(Text)}. A trailing document
 * longer than 5 bytes is written as well. Files whose name ends in {@code .bz2}
 * have their first two bytes skipped and are then read through a
 * {@code CBZip2InputStream}.
 *
 * <p>Side effects: assigns the static {@code seqFileWriter}, adds to the static
 * {@code totalBytes} and {@code totalRecords}, and prints progress to standard output.
 *
 * @param file the input file to convert
 * @throws Exception if the file system, the input, or the SequenceFile cannot be
 *                   opened, read, or written
 */
public static void copyFile(File file) throws Exception {
    File destFile = new File(outDir, file.getName() + ".seq");
    Path dest = new Path(destFile.getAbsolutePath());

    Configuration conf = new Configuration();
    FileSystem fileSys = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(conf.get("fs.default.name")),
            conf);
    CompressionCodec codec = new DefaultCodec();
    fileSys.mkdirs(dest.getParent());

    // Nested try/finally so that every resource opened so far is released even
    // when a later step throws.
    FSDataOutputStream outputStr = fileSys.create(dest);
    try {
        seqFileWriter = SequenceFile.createWriter(conf, outputStr, Text.class, Text.class,
                SequenceFile.CompressionType.BLOCK, codec);
        try {
            InputStream in = new BufferedInputStream(new FileInputStream(file));
            try {
                if (file.getName().endsWith(".bz2")) {
                    in.read();
                    in.read(); //snarf header
                    in = new CBZip2InputStream(in);
                }
                BufferedReader br = new BufferedReader(new InputStreamReader(in, "US-ASCII"));

                System.out.println("working on file " + file);
                int records = 0;
                long bytes = 0;
                long startTime = System.currentTimeMillis();
                String s = null;
                Text content = new Text();
                while ((s = br.readLine()) != null) {
                    if (s.startsWith("---END.OF.DOCUMENT---")) {
                        seqFileWriter.append(hash(content), content);
                        records++;
                        content = new Text();
                    } else {
                        // Encode with an explicit charset rather than the platform default.
                        byte[] line_as_bytes = (s + " ").getBytes("UTF-8");
                        for (byte b : line_as_bytes) {
                            // Java bytes are signed, so a set high bit shows up as a negative
                            // value; the previous "b < 128" check could never fail.
                            assert b >= 0 : "found an unexpected high-bit set";
                        }

                        content.append(line_as_bytes, 0, line_as_bytes.length);
                        bytes += line_as_bytes.length;
                    }
                } //end while
                if (content.getLength() > 5) {
                    seqFileWriter.append(hash(content), content);
                    records++;
                }
                totalBytes += bytes;
                totalRecords += records;
                // "+ 1" keeps the divisor non-zero for files processed in under a second.
                long time = (System.currentTimeMillis() - startTime) / 1000 + 1;
                long kbSec = bytes / 1024 / time;
                System.out.println(new java.util.Date());
                System.out.println("File " + file.getName() + " " + records + " records, " + bytes + " bytes in " + time
                        + " seconds (" + kbSec + " KB/sec).");
            } finally {
                in.close();
            }
        } finally {
            seqFileWriter.close();
        }
    } finally {
        outputStr.close();
    }
}

From source file:TweetTweetTweet.java

License:Open Source License

/**
 * Populates {@code tweet1}, {@code tweet2} and {@code tweet3} from a single
 * text value by calling {@code fromText} on each in turn.
 *
 * <p>Between consecutive tweets the first byte of {@code text} (the tab
 * separator) is removed in place, so {@code text} is modified by this call.
 * NOTE(review): this relies on each tweet's {@code fromText} leaving the
 * unconsumed remainder in {@code text} - confirm against that implementation.
 *
 * @param text the serialized form of the three tweets; mutated during parsing
 */
@Override
public void fromText(Text text) {
    tweet1.fromText(text);

    // Drop the leading tab before the second tweet.
    int remaining = text.getLength() - 1;
    text.set(text.getBytes(), 1, remaining);
    tweet2.fromText(text);

    // Drop the leading tab before the third tweet.
    remaining = text.getLength() - 1;
    text.set(text.getBytes(), 1, remaining);
    tweet3.fromText(text);
}

From source file:TestString.java

License:Apache License

/**
 * Times 100,000,000 iterations of taking two sub-ranges of a {@code Text}
 * via {@code Text.set(byte[], int, int)} and prints the results and the
 * elapsed wall-clock time. Makes no assertions.
 */
@Test
public void testTextSubstring() throws Exception {
    final int iterations = 100000000;
    Text source = new Text("string");
    Text head = new Text();
    Text tail = new Text();

    long begin = System.nanoTime();
    for (int n = 0; n < iterations; n++) {
        head.set(source.getBytes(), 0, 2);
        tail.set(source.getBytes(), 3, source.getLength() - 3);
    }
    long finish = System.nanoTime();

    float elapsedSeconds = (finish - begin) / 1000000000f;
    System.out.println("TestTextSubString");
    System.out.println("text1: " + head.toString());
    System.out.println("text2: " + tail.toString());
    System.out.println("Elapsed Time: " + elapsedSeconds + " seconds.");
}

From source file:accumulo.ingest.AbstractAccumuloCsvIngest.java

License:Apache License

/**
 * Overwrites {@code buffer} with a row id made of the file name bytes
 * followed by the encoded record count.
 *
 * @param buffer      destination; its previous contents are discarded
 * @param fileName    source of the row id prefix (first {@code getLength()} bytes)
 * @param recordCount value encoded with {@code lex} and appended as the suffix
 */
protected void setRowId(Text buffer, Text fileName, long recordCount) {
    final byte[] encodedCount = lex.encode(recordCount);
    final int prefixLength = fileName.getLength();

    buffer.clear();
    buffer.append(fileName.getBytes(), 0, prefixLength);
    buffer.append(encodedCount, 0, encodedCount.length);
}

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Position the input stream at the start of the first record.
 *
 * <p>Starting at the field {@code start}, lines are read through a temporary
 * {@code LineReader}. Lines rejected by {@code checkBuffer} advance
 * {@code start} past them. For any other line, two more lines are read and the
 * second of those must begin with {@code '+'}; if it does, {@code start} is
 * taken as the record start, otherwise scanning resumes just after the
 * candidate line. On return the stream has been sought to {@code start} and
 * {@code pos} is set to {@code start}.
 *
 * @param stream The stream to reposition.
 * @throws IOException propagated from seeking or reading the stream.
 */
protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
    Text buffer = new Text();

    if (true) { // (start > 0) // use start>0 to assume that files start with valid data
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one.  We then seek the file to that position.
        stream.seek(start);
        LineReader reader = new LineReader(stream);

        int bytesRead = 0;
        do {
            // Cap each read at the smaller of MAX_LINE_LENGTH and end - start.
            bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
            int bufferLength = buffer.getLength();
            if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
                // Not a candidate first line (per checkBuffer): skip past it.
                start += bytesRead;
            } else {
                // Reached when checkBuffer accepted the line (presumably it starts with @ -
                // confirm against checkBuffer) or when nothing was read.
                // Read two more lines and verify that the second one starts with a +
                //
                // If this isn't the start of a record, we want to backtrack to its end
                long backtrackPosition = start + bytesRead;

                // The first read's result is discarded; only the second line is checked below.
                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                    break; // all good!
                } else {
                    // backtrack to the end of the line we thought was the start of a record,
                    // and restart the reader there so its internal buffering is discarded.
                    start = backtrackPosition;
                    stream.seek(start);
                    reader = new LineReader(stream);
                }
            }
        } while (bytesRead > 0);

        // The temporary reader has read ahead; put the stream back at the chosen start.
        stream.seek(start);
    }

    pos = start;
}

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Parses a read from an interleaved FASTQ file.
 *
 * Only reads a single record: the ID line, then the sequence, separator and
 * quality lines, each appended to {@code value} followed by a newline byte.
 * {@code value} is appended to, not cleared, by this method.
 *
 * @param readName Text record containing read name. Output parameter; cleared
 *   first and then filled with the ID line.
 * @param value Text record containing full record. Output parameter.
 * @return Returns true if read was successful (did not hit EOF).
 *
 * @throws IOException Propagated from the underlying line reader; an
 *   {@code EOFException} is raised if the input ends after the ID line.
 * @throws RuntimeException Throws exception if FASTQ record doesn't
 *   have proper formatting (e.g., record doesn't start with @).
 */
protected boolean lowLevelFastqRead(Text readName, Text value) throws IOException {
    // ID line
    readName.clear();
    // appendLineInto already advances pos by the bytes it consumed, so pos must
    // not be incremented again here; doing so counted the ID line twice.
    long skipped = appendLineInto(readName, true);
    if (skipped == 0) {
        return false; // EOF
    }

    if (readName.getBytes()[0] != '@') {
        throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage()
                + ". Line: " + readName + ". \n");
    }

    value.append(readName.getBytes(), 0, readName.getLength());

    // sequence
    appendLineInto(value, false);

    // separator line
    appendLineInto(value, false);

    // quality
    appendLineInto(value, false);

    return true;
}

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Reads a newline into a text record from the underlying line reader.
 *
 * @param dest Text record to read line into.
 * @param eofOk Whether an EOF is acceptable in this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Throws if eofOk was false and we hit an EOF in
 *    the current line./*  w w  w.ja  va  2  s .  c  o  m*/
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    Text buf = new Text();
    int bytesRead = lineReader.readLine(buf, MAX_LINE_LENGTH);

    if (bytesRead < 0 || (bytesRead == 0 && !eofOk))
        throw new EOFException();

    dest.append(buf.getBytes(), 0, buf.getLength());
    dest.append(newline, 0, 1);
    pos += bytesRead;

    return bytesRead;
}