Example usage for org.apache.hadoop.io Text find

List of usage examples for org.apache.hadoop.io Text find

Introduction

On this page you can find example usages of org.apache.hadoop.io Text find.

Prototype

public int find(String what, int start) 

Source Link

Document

Finds any occurrence of what in the backing buffer, starting at position start.

Usage

From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java

License:Apache License

/**
 * Extracts a single delimited column from a line of text.
 *
 * <p>Columns are numbered starting at 1. The final column of a line does not
 * need to be followed by a delimiter: the end of the record terminates it.
 *
 * @param val       the line to split; only its first {@code val.getLength()} bytes are read
 * @param column    1-based index of the column to extract
 * @param delimiter the separator between columns; must be non-empty
 * @return a new {@code Text} holding a copy of the requested column's bytes
 * @throws IOException if the delimiter is null or empty, if {@code column} is
 *                     less than 1, or if the line has fewer than {@code column} columns
 */
public Text getColumn(Text val, int column, String delimiter) throws IOException {
    if (delimiter == null || delimiter.equals("")) {
        throw new IOException("Value of delimiter is empty");
    }
    if (column < 1) {
        throw new IOException("Column index must be 1 or greater but was " + column);
    }

    // Text.find works in byte offsets, so step over the delimiter by its
    // encoded byte length rather than its char count.
    final int delimiterLength = new Text(delimiter).getLength();
    final int recordLength = val.getLength();

    int fieldStart = 0;
    int fieldEnd = -1;
    for (int i = 0; i < column; i++) {
        if (i > 0) {
            if (fieldEnd >= recordLength) {
                // The previous field ran to the end of the record: nothing left to read.
                throw new IOException(
                        "Column " + column + " not found, record has only " + i + " column(s)");
            }
            fieldStart = fieldEnd + delimiterLength;
        }
        int found = val.find(delimiter, fieldStart);
        // find returns -1 when there is no further delimiter; the field then
        // extends to the end of the record.
        fieldEnd = (found < 0) ? recordLength : found;
    }

    logger.debug("text value is: " + val);
    Text keyColumn = new Text();
    keyColumn.set(val.getBytes(), fieldStart, fieldEnd - fieldStart);
    return keyColumn;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.full.Phase3Step2DistinctDataJobTest.java

License:Apache License

/**
 * Splits a key of the form {@code prefix_suffix} at the first underscore by
 * copying from the key's byte buffer, and checks both halves.
 */
@Test
public void testSplit() throws Exception {
    Text key = new Text("123_456789");

    // hard-split using array copy
    int i = key.find("_", 0);
    assertEquals(3, i);

    // getBytes() exposes the backing array, which may be longer than the
    // content; only the first getLength() bytes are valid.
    byte[] bytes = key.getBytes();
    int length = key.getLength();

    Text outputKey = new Text("");
    outputKey.append(bytes, i + 1, length - i - 1);

    // Decode as UTF-8 (Text's encoding) instead of the platform default charset.
    String fileName = Text.decode(bytes, 0, i);

    assertEquals("123", fileName);
    assertEquals("456789", outputKey.toString());
}

From source file:it.crs4.seal.common.CutText.java

License:Open Source License

/**
 * Walks through {@code record} one field at a time and stores the text and
 * starting byte offset of every requested field.
 *
 * <p>Requested field numbers are read from {@code columns}; results are
 * written to {@code extractedFields} and {@code extractedFieldPositions} at
 * the same index as the request. Scanning stops once all requested fields
 * have been stored or the record is exhausted.
 * NOTE(review): the single forward pass appears to assume {@code columns} is
 * sorted in ascending order - confirm against where it is populated.
 *
 * @param record the delimited record to scan
 * @throws FormatException if a field's bytes cannot be decoded, or if the
 *                         record ends before every requested field was found
 */
public void loadRecord(Text record) throws FormatException {
    final int recordLength = record.getLength();
    int cursor = 0; // byte offset where the current field begins
    int nextColumn = 0; // index of the next entry of columns still to be satisfied
    try {
        for (int field = 0; cursor < recordLength && nextColumn < columns.size(); ++field) {
            final int found = record.find(delim, cursor);
            // no further delimiter: the field runs to the end of the record
            final int fieldEnd = (found < 0) ? recordLength : found;

            if (columns.get(nextColumn) == field) {
                extractedFields[nextColumn] = Text.decode(record.getBytes(), cursor, fieldEnd - cursor);
                extractedFieldPositions[nextColumn] = cursor;
                ++nextColumn;
            }

            // step past the delimiter to the start of the following field
            cursor = fieldEnd + 1;
        }
    } catch (java.nio.charset.CharacterCodingException e) {
        throw new FormatException("character coding exception.  Message: " + e.getMessage(), record);
    }

    if (nextColumn < columns.size()) {
        throw new FormatException(
                "Missing field(s) in record. Field " + nextColumn + " (zero-based) not found.", record);
    }
}

From source file:it.crs4.seal.prq.PairReadsQSeqReducer.java

License:Open Source License

/**
 * Locates the start offsets of the fields in a delimited read.
 *
 * <p>The returned array holds three offsets: 0 for the first field, then the
 * offset just past the first delimiter, then the offset just past the second
 * delimiter. The first field (sequence) and the second field (quality) must
 * have equal lengths.
 *
 * @param read the delimited read to inspect
 * @return a three-element array of field start offsets
 * @throws RuntimeException if fewer than two delimiters are found, or if the
 *                          sequence and quality lengths differ
 */
private int[] findFields(Text read) {
    // find() yields the delimiter's offset, so +1 gives the start of the next
    // field; a miss (-1) becomes 0 and is rejected below.
    final int secondStart = read.find(delim, 0) + 1;
    if (secondStart <= 0) {
        throw new RuntimeException("invalid read/quality format: " + read.toString());
    }

    final int thirdStart = read.find(delim, secondStart) + 1;
    if (thirdStart <= 0) {
        throw new RuntimeException("invalid read/quality format: " + read.toString());
    }

    final int seqLength = secondStart - 1;
    final int qualLength = thirdStart - secondStart - 1;
    if (seqLength != qualLength) {
        throw new RuntimeException(
                "sequence and quality lengths don't match! (got " + seqLength + " and " + qualLength + ")");
    }

    return new int[] { 0, secondStart, thirdStart };
}