Example usage for org.apache.hadoop.io.Text.decode

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.decode.

Prototype

public static String decode(byte[] utf8, int start, int length) throws CharacterCodingException 
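
Before the project examples, here is a minimal, self-contained sketch of the pattern that recurs below: locate a delimiter with Text.find and decode only a slice of the backing byte array, rather than converting the whole line to a String. The class name and sample input are illustrative only, not taken from any of the projects listed here.

import java.nio.charset.CharacterCodingException;

import org.apache.hadoop.io.Text;

public class TextDecodeSketch {
    public static void main(String[] args) throws CharacterCodingException {
        // A tab-separated record held in a Hadoop Text object.
        Text line = new Text("42\t12345");

        // Find the delimiter, then decode only the bytes of the first field.
        int tab = line.find("\t");
        String firstField = Text.decode(line.getBytes(), 0, tab);

        // Decode the remainder of the line.  Use getLength(), not getBytes().length:
        // the backing array may be larger than the valid data.
        String secondField = Text.decode(line.getBytes(), tab + 1, line.getLength() - (tab + 1));

        System.out.println(firstField + " / " + secondField); // prints "42 / 12345"
    }
}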

Usage

From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.SummarySort.java

License:Open Source License

@Override
public boolean nextKeyValue() throws IOException, CharacterCodingException {
    if (!lineRR.nextKeyValue())
        return false;

    Text line = getCurrentValue();
    int tabOne = line.find("\t");

    int rid = Integer.parseInt(Text.decode(line.getBytes(), 0, tabOne));

    int tabTwo = line.find("\t", tabOne + 1);
    int posBeg = tabOne + 1;
    int posEnd = tabTwo - 1;

    int pos = Integer.parseInt(Text.decode(line.getBytes(), posBeg, posEnd - posBeg + 1));

    key.set(BAMRecordReader.getKey0(rid, pos));
    return true;
}

From source file:hivemall.utils.hadoop.JsonSerdeUtils.java

License:Apache License

@SuppressWarnings("deprecation")
@Nullable
private static Object extractCurrentField(@Nonnull final JsonParser p,
        @Nonnull final HCatFieldSchema hcatFieldSchema, final boolean isTokenCurrent) throws IOException {
    JsonToken valueToken;
    if (isTokenCurrent) {
        valueToken = p.getCurrentToken();
    } else {
        valueToken = p.nextToken();
    }

    final Object val;
    switch (hcatFieldSchema.getType()) {
    case INT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue();
        break;
    case TINYINT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue();
        break;
    case SMALLINT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue();
        break;
    case BIGINT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getLongValue();
        break;
    case BOOLEAN:
        String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
        if (bval != null) {
            val = Boolean.valueOf(bval);
        } else {
            val = null;
        }
        break;
    case FLOAT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue();
        break;
    case DOUBLE:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue();
        break;
    case STRING:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
        break;
    case BINARY:
        String b = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
        if (b != null) {
            try {
                String t = Text.decode(b.getBytes(), 0, b.getBytes().length);
                return t.getBytes();
            } catch (CharacterCodingException e) {
                throw new IOException("Error generating json binary type from object.", e);
            }
        } else {
            val = null;
        }
        break;
    case DATE:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : Date.valueOf(p.getText());
        break;
    case TIMESTAMP:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : Timestamp.valueOf(p.getText());
        break;
    case DECIMAL:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : HiveDecimal.create(p.getText());
        break;
    case VARCHAR:
        int vLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
        val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveVarchar(p.getText(), vLen);
        break;
    case CHAR:
        int cLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
        val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveChar(p.getText(), cLen);
        break;
    case ARRAY:
        if (valueToken == JsonToken.VALUE_NULL) {
            val = null;
            break;
        }
        if (valueToken != JsonToken.START_ARRAY) {
            throw new IOException("Start of Array expected");
        }
        final List<Object> arr = new ArrayList<>();
        final HCatFieldSchema elemSchema = hcatFieldSchema.getArrayElementSchema().get(0);
        while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) {
            arr.add(extractCurrentField(p, elemSchema, true));
        }
        val = arr;
        break;
    case MAP:
        if (valueToken == JsonToken.VALUE_NULL) {
            val = null;
            break;
        }
        if (valueToken != JsonToken.START_OBJECT) {
            throw new IOException("Start of Object expected");
        }
        final Map<Object, Object> map = new LinkedHashMap<>();
        final HCatFieldSchema valueSchema = hcatFieldSchema.getMapValueSchema().get(0);
        while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
            Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(),
                    hcatFieldSchema.getMapKeyTypeInfo());
            Object v = extractCurrentField(p, valueSchema, false);
            map.put(k, v);
        }
        val = map;
        break;
    case STRUCT:
        if (valueToken == JsonToken.VALUE_NULL) {
            val = null;
            break;
        }
        if (valueToken != JsonToken.START_OBJECT) {
            throw new IOException("Start of Object expected");
        }
        HCatSchema subSchema = hcatFieldSchema.getStructSubSchema();
        int sz = subSchema.getFieldNames().size();

        List<Object> struct = new ArrayList<>(Collections.nCopies(sz, null));
        while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
            populateRecord(struct, valueToken, p, subSchema);
        }
        val = struct;
        break;
    default:
        throw new IOException("Unknown type found: " + hcatFieldSchema.getType());
    }
    return val;
}

From source file:hivemall.utils.hadoop.JsonSerdeUtils.java

License:Apache License

@Nonnull
private static Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveTypeInfo mapKeyType)
        throws IOException {
    switch (Type.getPrimitiveHType(mapKeyType)) {
    case INT:
        return Integer.valueOf(s);
    case TINYINT:
        return Byte.valueOf(s);
    case SMALLINT:
        return Short.valueOf(s);
    case BIGINT:
        return Long.valueOf(s);
    case BOOLEAN:
        return (s.equalsIgnoreCase("true"));
    case FLOAT:
        return Float.valueOf(s);
    case DOUBLE:
        return Double.valueOf(s);
    case STRING:
        return s;
    case BINARY:
        try {
            String t = Text.decode(s.getBytes(), 0, s.getBytes().length);
            return t.getBytes();
        } catch (CharacterCodingException e) {
            throw new IOException("Error generating json binary type from object.", e);
        }
    case DATE:
        return Date.valueOf(s);
    case TIMESTAMP:
        return Timestamp.valueOf(s);
    case DECIMAL:
        return HiveDecimal.create(s);
    case VARCHAR:
        return new HiveVarchar(s, ((BaseCharTypeInfo) mapKeyType).getLength());
    case CHAR:
        return new HiveChar(s, ((BaseCharTypeInfo) mapKeyType).getLength());
    default:
        throw new IOException("Could not convert from string to map type " + mapKeyType.getTypeName());
    }
}

From source file:it.crs4.seal.common.CutText.java

License:Open Source License

public void loadRecord(Text record) throws FormatException {
    int pos = 0; // the byte position within the record
    int fieldno = 0; // the field index within the record
    int colno = 0; // the index within the list of requested fields (columns)
    try {
        while (pos < record.getLength() && colno < columns.size()) // iterate over each field
        {
            int endpos = record.find(delim, pos); // the field's end position
            if (endpos < 0)
                endpos = record.getLength();

            if (columns.get(colno) == fieldno) // if we're at a requested field
            {
                extractedFields[colno] = Text.decode(record.getBytes(), pos, endpos - pos);
                extractedFieldPositions[colno] = pos;
                colno += 1; // advance column
            }

            pos = endpos + 1; // the next starting position is the current end + 1
            fieldno += 1;
        }
    } catch (java.nio.charset.CharacterCodingException e) {
        throw new FormatException("character coding exception.  Message: " + e.getMessage(), record);
    }

    if (colno < columns.size())
        throw new FormatException("Missing field(s) in record. Field " + colno + " (zero-based) not found.",
                record);
}

From source file:it.crs4.seal.common.TextSamMapping.java

License:Open Source License

protected String getTagText(String name) {
    if (tagsStart >= unparsedData.getLength()) // no tags
        return null;

    String text = null;
    try {
        int pos = unparsedData.find(Delim + name, tagsStart - 1);
        if (pos >= 0) {
            int fieldEnd = unparsedData.find(Delim, pos + 1); // fieldEnd: index one position beyond the last char of the field
            if (fieldEnd < 0)
                fieldEnd = unparsedData.getLength();
            // decode n bytes from start
            //  start = pos + 1 (+1 to skip the delimiter)
            //  n = fieldEnd - start
            //    = fieldEnd - (pos + 1)
            //    = fieldEnd - pos - 1
            text = Text.decode(unparsedData.getBytes(), pos + 1, fieldEnd - pos - 1);
        }
    } catch (java.nio.charset.CharacterCodingException e) {
        throw new RuntimeException(
                "character coding error retrieving tag '" + name + "' from SAM record " + this.toString());
    }

    return text;
}

From source file:it.crs4.seal.prq.PairReadsQSeqMapper.java

License:Open Source License

public void map(Text readId, SequencedFragment read, IMRContext<SequenceId, Text> context)
        throws IOException, InterruptedException {
    // build the key
    builder.delete(0, builder.length());

    // fields up to and including the index number go in the location.  The read is on its own.
    if (read.getRead() == null)
        throw new RuntimeException("Cannot get read number from read: " + readId);

    if (read.getLane() != null && read.getTile() != null && read.getXpos() != null && read.getYpos() != null) {
        appendIdToBuilder(builder, read); // appends the read id to the builder provided
        // finally the index field
        builder.append("#").append(read.getIndexSequence() == null ? '0' : read.getIndexSequence());
        sequenceKey.set(builder.toString(), read.getRead());
    } else {
        // maybe it's a fastq id with a trailing read number (/1 or /2)
        if (readId.getLength() > 2) {
            int last = readId.getLength() - 1;
            if (readId.charAt(last - 1) == '/') {
                // truncate the /[12] from the read id
                // last == length - 1.  We want length - 2 bytes, which is equal to last - 1
                sequenceKey.set(Text.decode(readId.getBytes(), 0, last - 1), read.getRead());
            } else
                throw new RuntimeException(
                        "Didn't find /read_number at end of the read id.  Please use qseq files or fastq with illumina-formatted name tags.");
        } else
            throw new RuntimeException("Read id " + readId
                    + " is too short.   Please use qseq files or fastq with illumina-formatted name tags.");
    }

    // then the tab-delimited value
    sequenceValue.clear();
    sequenceValue.append(read.getSequence().getBytes(), 0, read.getSequence().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    sequenceValue.append(read.getQuality().getBytes(), 0, read.getQuality().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    // the filter flag is optional.  If it's absent we assume the read passes filtering.
    sequenceValue.append(ZeroOne, (read.getFilterPassed() == null || read.getFilterPassed() ? 1 : 0), 1);

    context.write(sequenceKey, sequenceValue);
    context.progress();
}

From source file:it.uniroma1.hadoop.pagerank.job1.PageRankJob1Mapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    /* The Job #1 mapper simply parses a line of the input graph, emitting key-value pairs.
     * The input format is the following (separator is TAB):
     *
     *     <nodeA>    <nodeB>
     * 
     * which denotes an edge going from <nodeA> to <nodeB>.
     * We need to skip comment lines (denoted by a # character at the beginning of the line).
     * We will also collect all the distinct nodes in our graph: this is needed to compute the initial 
     * pagerank value in Job #1 reducer and also in later jobs.
     */

    if (value.charAt(0) != '#') {

        int tabIndex = value.find("\t");
        String nodeA = Text.decode(value.getBytes(), 0, tabIndex);
        String nodeB = Text.decode(value.getBytes(), tabIndex + 1, value.getLength() - (tabIndex + 1));
        context.write(new Text(nodeA), new Text(nodeB));

        // add the current source node to the node list so we can 
        // compute the total amount of nodes of our graph in Job#2
        PageRank.NODES.add(nodeA);
        // also add the target node to the same list: we may have a target node 
        // with no outlinks (so it will never be parsed as source)
        PageRank.NODES.add(nodeB);

    }

}

From source file:it.uniroma1.hadoop.pagerank.job2.PageRankJob2Mapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    /* PageRank calculation algorithm (mapper)
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,<link4>,... ,<linkN>
     * 
     * The output has two kinds of records:
     * One record composed of the collection of links of each page:
     *     
     *     <title>   |<link1>,<link2>,<link3>,<link4>, ... , <linkN>
     *     
     * Another record composed of the linked page, the page rank of the source page,
     * and the total number of outgoing links of the source page:
     *  
     *     <link>    <page-rank>    <total-links>
     */

    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    String pageRank = Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1));
    String links = Text.decode(value.getBytes(), tIdx2 + 1, value.getLength() - (tIdx2 + 1));

    String[] allOtherPages = links.split(",");
    for (String otherPage : allOtherPages) {
        Text pageRankWithTotalLinks = new Text(pageRank + "\t" + allOtherPages.length);
        context.write(new Text(otherPage), pageRankWithTotalLinks);
    }

    // put the original links so the reducer is able to produce the correct output
    context.write(new Text(page), new Text(PageRank.LINKS_SEPARATOR + links));

}

From source file:it.uniroma1.hadoop.pagerank.job3.PageRankJob3Mapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    /* Rank Ordering (mapper only)
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,<link4>,... ,<linkN>
     * 
     * This is a simple job which does the ordering of our documents according to the computed pagerank.
     * We will map the pagerank (key) to its value (page) and Hadoop will do the sorting on keys for us.
     * There is no need to implement a reducer: the mapping and sorting is enough for our purpose.
     */

    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    float pageRank = Float.parseFloat(Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1)));

    context.write(new DoubleWritable(pageRank), new Text(page));

}

From source file:mvm.rya.indexing.accumulo.freetext.AccumuloFreeTextIndexer.java

License:Apache License

private static CloseableIteration<Statement, QueryEvaluationException> getIteratorWrapper(final Scanner s) {

    final Iterator<Entry<Key, Value>> i = s.iterator();

    return new CloseableIteration<Statement, QueryEvaluationException>() {
        @Override
        public boolean hasNext() {
            return i.hasNext();
        }

        @Override
        public Statement next() throws QueryEvaluationException {
            Entry<Key, Value> entry = i.next();
            Value v = entry.getValue();
            try {
                String dataString = Text.decode(v.get(), 0, v.getSize());
                Statement s = StatementSerializer.readStatement(dataString);
                return s;
            } catch (CharacterCodingException e) {
                logger.error("Error decoding value", e);
                throw new QueryEvaluationException(e);
            } catch (IOException e) {
                logger.error("Error deserializing statement", e);
                throw new QueryEvaluationException(e);
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("Remove not implemented");
        }

        @Override
        public void close() throws QueryEvaluationException {
            if (s != null) {
                s.close();
            }
        }
    };
}