Example usage for org.apache.hadoop.io.Text.decode

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.decode.

Prototype

public static String decode(byte[] utf8, int start, int length) throws CharacterCodingException 
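
Before the project examples, here is a minimal, self-contained sketch of the pattern that recurs below: locate a delimiter with Text.find and decode only a slice of the backing byte array, rather than converting the whole line to a String. The class name and sample input are illustrative only, not taken from any of the projects listed here.

import java.nio.charset.CharacterCodingException;

import org.apache.hadoop.io.Text;

public class TextDecodeSketch {
    public static void main(String[] args) throws CharacterCodingException {
        // A tab-separated record held in a Hadoop Text object.
        Text line = new Text("42\t12345");

        // Find the delimiter, then decode only the bytes of the first field.
        int tab = line.find("\t");
        String firstField = Text.decode(line.getBytes(), 0, tab);

        // Decode the remainder of the line.  Use getLength(), not getBytes().length:
        // the backing array may be larger than the valid data.
        String secondField = Text.decode(line.getBytes(), tab + 1, line.getLength() - (tab + 1));

        System.out.println(firstField + " / " + secondField); // prints "42 / 12345"
    }
}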

Usage

From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.SummarySort.java

License:Open Source License

@Override
public boolean nextKeyValue() throws IOException, CharacterCodingException {
    if (!lineRR.nextKeyValue())
        return false;

    Text line = getCurrentValue();
    int tabOne = line.find("\t");

    int rid = Integer.parseInt(Text.decode(line.getBytes(), 0, tabOne));

    int tabTwo = line.find("\t", tabOne + 1);
    int posBeg = tabOne + 1;
    int posEnd = tabTwo - 1;

    int pos = Integer.parseInt(Text.decode(line.getBytes(), posBeg, posEnd - posBeg + 1));

    key.set(BAMRecordReader.getKey0(rid, pos));
    return true;
}

From source file:hivemall.utils.hadoop.JsonSerdeUtils.java

License:Apache License

@SuppressWarnings("deprecation")
@Nullable
private static Object extractCurrentField(@Nonnull final JsonParser p,
        @Nonnull final HCatFieldSchema hcatFieldSchema, final boolean isTokenCurrent) throws IOException {
    JsonToken valueToken;
    if (isTokenCurrent) {
        valueToken = p.getCurrentToken();
    } else {
        valueToken = p.nextToken();
    }

    final Object val;
    switch (hcatFieldSchema.getType()) {
    case INT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getIntValue();
        break;
    case TINYINT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getByteValue();
        break;
    case SMALLINT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getShortValue();
        break;
    case BIGINT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getLongValue();
        break;
    case BOOLEAN:
        String bval = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
        if (bval != null) {
            val = Boolean.valueOf(bval);
        } else {
            val = null;
        }
        break;
    case FLOAT:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getFloatValue();
        break;
    case DOUBLE:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getDoubleValue();
        break;
    case STRING:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
        break;
    case BINARY:
        String b = (valueToken == JsonToken.VALUE_NULL) ? null : p.getText();
        if (b != null) {
            try {
                String t = Text.decode(b.getBytes(), 0, b.getBytes().length);
                return t.getBytes();
            } catch (CharacterCodingException e) {
                throw new IOException("Error generating json binary type from object.", e);
            }
        } else {
            val = null;
        }
        break;
    case DATE:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : Date.valueOf(p.getText());
        break;
    case TIMESTAMP:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : Timestamp.valueOf(p.getText());
        break;
    case DECIMAL:
        val = (valueToken == JsonToken.VALUE_NULL) ? null : HiveDecimal.create(p.getText());
        break;
    case VARCHAR:
        int vLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
        val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveVarchar(p.getText(), vLen);
        break;
    case CHAR:
        int cLen = ((BaseCharTypeInfo) hcatFieldSchema.getTypeInfo()).getLength();
        val = (valueToken == JsonToken.VALUE_NULL) ? null : new HiveChar(p.getText(), cLen);
        break;
    case ARRAY:
        if (valueToken == JsonToken.VALUE_NULL) {
            val = null;
            break;
        }
        if (valueToken != JsonToken.START_ARRAY) {
            throw new IOException("Start of Array expected");
        }
        final List<Object> arr = new ArrayList<>();
        final HCatFieldSchema elemSchema = hcatFieldSchema.getArrayElementSchema().get(0);
        while ((valueToken = p.nextToken()) != JsonToken.END_ARRAY) {
            arr.add(extractCurrentField(p, elemSchema, true));
        }
        val = arr;
        break;
    case MAP:
        if (valueToken == JsonToken.VALUE_NULL) {
            val = null;
            break;
        }
        if (valueToken != JsonToken.START_OBJECT) {
            throw new IOException("Start of Object expected");
        }
        final Map<Object, Object> map = new LinkedHashMap<>();
        final HCatFieldSchema valueSchema = hcatFieldSchema.getMapValueSchema().get(0);
        while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
            Object k = getObjectOfCorrespondingPrimitiveType(p.getCurrentName(),
                    hcatFieldSchema.getMapKeyTypeInfo());
            Object v = extractCurrentField(p, valueSchema, false);
            map.put(k, v);
        }
        val = map;
        break;
    case STRUCT:
        if (valueToken == JsonToken.VALUE_NULL) {
            val = null;
            break;
        }
        if (valueToken != JsonToken.START_OBJECT) {
            throw new IOException("Start of Object expected");
        }
        HCatSchema subSchema = hcatFieldSchema.getStructSubSchema();
        int sz = subSchema.getFieldNames().size();

        List<Object> struct = new ArrayList<>(Collections.nCopies(sz, null));
        while ((valueToken = p.nextToken()) != JsonToken.END_OBJECT) {
            populateRecord(struct, valueToken, p, subSchema);
        }
        val = struct;
        break;
    default:
        throw new IOException("Unknown type found: " + hcatFieldSchema.getType());
    }
    return val;
}

From source file:hivemall.utils.hadoop.JsonSerdeUtils.java

License:Apache License

@Nonnull
private static Object getObjectOfCorrespondingPrimitiveType(String s, PrimitiveTypeInfo mapKeyType)
        throws IOException {
    switch (Type.getPrimitiveHType(mapKeyType)) {
    case INT:
        return Integer.valueOf(s);
    case TINYINT:
        return Byte.valueOf(s);
    case SMALLINT:
        return Short.valueOf(s);
    case BIGINT:
        return Long.valueOf(s);
    case BOOLEAN:
        return (s.equalsIgnoreCase("true"));
    case FLOAT:
        return Float.valueOf(s);
    case DOUBLE:
        return Double.valueOf(s);
    case STRING:
        return s;
    case BINARY:
        try {
            String t = Text.decode(s.getBytes(), 0, s.getBytes().length);
            return t.getBytes();
        } catch (CharacterCodingException e) {
            throw new IOException("Error generating json binary type from object.", e);
        }
    case DATE:
        return Date.valueOf(s);
    case TIMESTAMP:
        return Timestamp.valueOf(s);
    case DECIMAL:
        return HiveDecimal.create(s);
    case VARCHAR:
        return new HiveVarchar(s, ((BaseCharTypeInfo) mapKeyType).getLength());
    case CHAR:
        return new HiveChar(s, ((BaseCharTypeInfo) mapKeyType).getLength());
    default:
        throw new IOException("Could not convert from string to map type " + mapKeyType.getTypeName());
    }
}

From source file:it.crs4.seal.common.CutText.java

License:Open Source License

public void loadRecord(Text record) throws FormatException {
    int pos = 0; // the byte position within the record
    int fieldno = 0; // the field index within the record
    int colno = 0; // the index within the list of requested fields (columns)
    try {
        while (pos < record.getLength() && colno < columns.size()) // iterate over each field
        {
            int endpos = record.find(delim, pos); // the field's end position
            if (endpos < 0)
                endpos = record.getLength();

            if (columns.get(colno) == fieldno) // if we're at a requested field
            {
                extractedFields[colno] = Text.decode(record.getBytes(), pos, endpos - pos);
                extractedFieldPositions[colno] = pos;
                colno += 1; // advance column
            }

            pos = endpos + 1; // the next starting position is the current end + 1
            fieldno += 1;
        }
    } catch (java.nio.charset.CharacterCodingException e) {
        throw new FormatException("character coding exception.  Message: " + e.getMessage(), record);
    }

    if (colno < columns.size())
        throw new FormatException("Missing field(s) in record. Field " + colno + " (zero-based) not found.",
                record);
}

From source file:it.crs4.seal.common.TextSamMapping.java

License:Open Source License

protected String getTagText(String name) {
    if (tagsStart >= unparsedData.getLength()) // no tags
        return null;

    String text = null;
    try {
        int pos = unparsedData.find(Delim + name, tagsStart - 1);
        if (pos >= 0) {
            int fieldEnd = unparsedData.find(Delim, pos + 1); // fieldEnd: index one position beyond the last char of the field
            if (fieldEnd < 0)
                fieldEnd = unparsedData.getLength();
            // decode n bytes from start
            //  start = pos + 1 (+1 to skip the delimiter)
            //  n = fieldEnd - start
            //    = fieldEnd - (pos + 1)
            //    = fieldEnd - pos - 1
            text = Text.decode(unparsedData.getBytes(), pos + 1, fieldEnd - pos - 1);
        }
    } catch (java.nio.charset.CharacterCodingException e) {
        throw new RuntimeException(
                "character coding error retrieving tag '" + name + "' from SAM record " + this.toString());
    }

    return text;
}

From source file:it.crs4.seal.prq.PairReadsQSeqMapper.java

License:Open Source License

public void map(Text readId, SequencedFragment read, IMRContext<SequenceId, Text> context)
        throws IOException, InterruptedException {
    // build the key
    builder.delete(0, builder.length());

    // fields up to and including the index number go in the location.  The read is on its own.
    if (read.getRead() == null)
        throw new RuntimeException("Cannot get read number from read: " + readId);

    if (read.getLane() != null && read.getTile() != null && read.getXpos() != null && read.getYpos() != null) {
        appendIdToBuilder(builder, read); // appends the read id to the builder provided
        // finally the index field
        builder.append("#").append(read.getIndexSequence() == null ? '0' : read.getIndexSequence());
        sequenceKey.set(builder.toString(), read.getRead());
    } else {
        // maybe it's a fastq id with a trailing read number (/1 or /2)
        if (readId.getLength() > 2) {
            int last = readId.getLength() - 1;
            if (readId.charAt(last - 1) == '/') {
                // truncate the /[12] from the read id
                // last == length - 1.  We want length - 2 bytes, which is equal to last - 1
                sequenceKey.set(Text.decode(readId.getBytes(), 0, last - 1), read.getRead());
            } else
                throw new RuntimeException(
                        "Didn't find /read_number at end of the read id.  Please use qseq files or fastq with illumina-formatted name tags.");
        } else
            throw new RuntimeException("Read id " + readId
                    + " is too short.   Please use qseq files or fastq with illumina-formatted name tags.");
    }

    // then the tab-delimited value
    sequenceValue.clear();
    sequenceValue.append(read.getSequence().getBytes(), 0, read.getSequence().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    sequenceValue.append(read.getQuality().getBytes(), 0, read.getQuality().getLength());
    sequenceValue.append(Delim, 0, Delim.length);
    // the filter flag is optional.  If it's absent we assume the read passes filtering.
    sequenceValue.append(ZeroOne, (read.getFilterPassed() == null || read.getFilterPassed() ? 1 : 0), 1);

    context.write(sequenceKey, sequenceValue);
    context.progress();
}

From source file:it.uniroma1.hadoop.pagerank.job1.PageRankJob1Mapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    /* The Job #1 mapper simply parses a line of the input graph, emitting key-value pairs.
     * The input format is the following (separator is TAB):
     *
     *     <nodeA>    <nodeB>
     * 
     * which denotes an edge going from <nodeA> to <nodeB>.
     * We need to skip comment lines (denoted by a # character at the beginning of the line).
     * We will also collect all the distinct nodes in our graph: this is needed to compute the initial 
     * pagerank value in Job #1 reducer and also in later jobs.
     */

    if (value.charAt(0) != '#') {

        int tabIndex = value.find("\t");
        String nodeA = Text.decode(value.getBytes(), 0, tabIndex);
        String nodeB = Text.decode(value.getBytes(), tabIndex + 1, value.getLength() - (tabIndex + 1));
        context.write(new Text(nodeA), new Text(nodeB));

        // add the current source node to the node list so we can 
        // compute the total amount of nodes of our graph in Job#2
        PageRank.NODES.add(nodeA);
        // also add the target node to the same list: we may have a target node 
        // with no outlinks (so it will never be parsed as source)
        PageRank.NODES.add(nodeB);

    }

}

From source file:it.uniroma1.hadoop.pagerank.job2.PageRankJob2Mapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    /* PageRank calculation algorithm (mapper)
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,<link4>,... ,<linkN>
     * 
     * The output has two kinds of records:
     * One record composed of the collection of links of each page:
     *     
     *     <title>   |<link1>,<link2>,<link3>,<link4>, ... , <linkN>
     *     
     * Another record composed of the linked page, the page rank of the source page,
     * and the total number of outgoing links of the source page:
     *  
     *     <link>    <page-rank>    <total-links>
     */

    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    String pageRank = Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1));
    String links = Text.decode(value.getBytes(), tIdx2 + 1, value.getLength() - (tIdx2 + 1));

    String[] allOtherPages = links.split(",");
    for (String otherPage : allOtherPages) {
        Text pageRankWithTotalLinks = new Text(pageRank + "\t" + allOtherPages.length);
        context.write(new Text(otherPage), pageRankWithTotalLinks);
    }

    // put the original links so the reducer is able to produce the correct output
    context.write(new Text(page), new Text(PageRank.LINKS_SEPARATOR + links));

}

From source file:it.uniroma1.hadoop.pagerank.job3.PageRankJob3Mapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    /* Rank Ordering (mapper only)
     * Input file format (separator is TAB):
     *
     *     <title>    <page-rank>    <link1>,<link2>,<link3>,<link4>,... ,<linkN>
     * 
     * This is a simple job which does the ordering of our documents according to the computed pagerank.
     * We will map the pagerank (key) to its value (page) and Hadoop will do the sorting on keys for us.
     * There is no need to implement a reducer: the mapping and sorting is enough for our purpose.
     */

    int tIdx1 = value.find("\t");
    int tIdx2 = value.find("\t", tIdx1 + 1);

    // extract tokens from the current line
    String page = Text.decode(value.getBytes(), 0, tIdx1);
    float pageRank = Float.parseFloat(Text.decode(value.getBytes(), tIdx1 + 1, tIdx2 - (tIdx1 + 1)));

    context.write(new DoubleWritable(pageRank), new Text(page));

}

From source file:mvm.rya.indexing.accumulo.freetext.AccumuloFreeTextIndexer.java

License:Apache License

private static CloseableIteration<Statement, QueryEvaluationException> getIteratorWrapper(final Scanner s) {

    final Iterator<Entry<Key, Value>> i = s.iterator();

    return new CloseableIteration<Statement, QueryEvaluationException>() {
        @Override
        public boolean hasNext() {
            return i.hasNext();
        }

        @Override
        public Statement next() throws QueryEvaluationException {
            Entry<Key, Value> entry = i.next();
            Value v = entry.getValue();
            try {
                String dataString = Text.decode(v.get(), 0, v.getSize());
                Statement s = StatementSerializer.readStatement(dataString);
                return s;
            } catch (CharacterCodingException e) {
                logger.error("Error decoding value", e);
                throw new QueryEvaluationException(e);
            } catch (IOException e) {
                logger.error("Error deserializing statement", e);
                throw new QueryEvaluationException(e);
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("Remove not implemented");
        }

        @Override
        public void close() throws QueryEvaluationException {
            if (s != null) {
                s.close();
            }
        }
    };
}