Example usage for org.apache.hadoop.io Text set

List of usage examples for org.apache.hadoop.io Text set

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.set.

Prototype

public void set(byte[] utf8, int start, int len) 

Source Link

Document

Sets the Text to a range of bytes.

Usage

From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java

License:Open Source License

/**
 * Parses a serialized tag map from the beginning of {@code text} into {@code tags}.
 *
 * <p>The expected layout is the {@code MapStart} byte, followed by key/value pairs in which the
 * key is terminated by the {@code KeyValueSeparator} byte and pairs are delimited by the
 * {@code FieldSeparator} byte, up to the {@code MapEnd} byte. If the text is empty or does not
 * start with the {@code MapStart} byte, {@code tags} stays empty and {@code text} is untouched.
 *
 * <p>When parsing stops before the end of the text, {@code text} is replaced by the bytes from
 * the stop position onwards; the {@code MapEnd} byte itself is not skipped. When parsing runs to
 * the end of the text, {@code text} is left unchanged.
 *
 * @param text the text to parse; may be modified as described above
 * @param tags the map that receives the parsed entries; it is cleared first
 */
public static void consumeMap(Text text, Map<String, String> tags) {
    tags.clear();
    final int length = text.getLength();
    if (length == 0) {
        return;
    }
    // Only the first 'length' bytes of the backing array belong to the text.
    final byte[] tagsBytes = text.getBytes();
    if (tagsBytes[0] != Separators[MapStart]) {
        return;
    }
    int i1 = 1;
    while (i1 < length && tagsBytes[i1] != Separators[MapEnd]) {
        // Key occupies [i1, i2); a key is assumed to have at least one byte.
        int i2 = i1 + 1;
        while (i2 < length && tagsBytes[i2] != Separators[KeyValueSeparator]) {
            i2++;
        }
        // Hadoop Text holds UTF-8, so decode explicitly instead of using the platform charset.
        String key = new String(tagsBytes, i1, i2 - i1, java.nio.charset.StandardCharsets.UTF_8);
        // Step over the key/value separator; clamp so truncated input cannot run past the text.
        i1 = Math.min(i2 + 1, length);

        // Value occupies [i1, i2). Scanning starts AT i1 so that an empty value is recognized
        // instead of swallowing the following separator and the pairs after it.
        i2 = i1;
        while (i2 < length && tagsBytes[i2] != Separators[FieldSeparator]
                && tagsBytes[i2] != Separators[MapEnd]) {
            i2++;
        }
        String value = new String(tagsBytes, i1, i2 - i1, java.nio.charset.StandardCharsets.UTF_8);
        tags.put(key, value);
        i1 = i2;
        if (i1 < length && tagsBytes[i1] == Separators[FieldSeparator]) {
            i1++;
        }
    }
    // Keep whatever follows the parsed pairs (starting at the MapEnd byte, if one was found).
    if (i1 < length) {
        text.set(tagsBytes, i1, length - i1);
    }
}

From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java

License:Open Source License

/**
 * Consumes the leading geometry string from {@code text} and parses it with {@code parseText}.
 *
 * <p>If the text starts with a single or double quote, that quote character replaces
 * {@code separator} as the terminator and the geometry string starts right after it. The
 * geometry string extends up to (not including) the first terminator byte, or to the end of the
 * text when no terminator is found.
 *
 * <p>After the call {@code text} holds the bytes from the terminator onwards; the terminator
 * itself is not removed.
 *
 * @param text      the text to consume from; modified in place
 * @param separator the byte that ends the geometry string when the text is not quoted
 * @return the result of {@code parseText} applied to the extracted string
 */
public static OGCGeometry consumeGeometryESRI(Text text, char separator) {
    byte[] bytes = text.getBytes();
    int length = text.getLength();

    // Look for the terminator of the shape text.
    int i1 = 0;
    // Guard on length: for an empty text the backing array may be empty or hold stale bytes.
    if (length > 0 && (bytes[i1] == '\'' || bytes[i1] == '\"')) {
        separator = (char) bytes[i1++];
    }
    int i2 = i1;
    while (i2 < length && bytes[i2] != separator) {
        i2++;
    }

    // Hadoop Text holds UTF-8, so decode explicitly instead of using the platform charset.
    String str = new String(bytes, i1, i2 - i1, java.nio.charset.StandardCharsets.UTF_8);

    // Remove consumed bytes from the text (the terminator at i2, if any, is kept).
    text.set(bytes, i2, length - i2);

    return parseText(str);
}

From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java

License:Open Source License

/**
 * Consumes one geometry from the beginning of {@code text} and parses it with the JTS readers.
 *
 * <p>Three encodings are tried in this order:
 * <ol>
 * <li>a string enclosed in single or double quotes, always treated as WKT;</li>
 * <li>unquoted text that starts with one of the {@code ShapeNames} prefixes, treated as WKT
 * that runs to the parenthesis closing the first open parenthesis;</li>
 * <li>a run of more than one byte accepted by the {@code IsHex} table, treated as hex-encoded
 * WKB.</li>
 * </ol>
 * If none of them applies, {@code null} is returned.
 *
 * <p>The consumed bytes, plus one following {@code separator} byte if present, are removed from
 * {@code text}; the text is cleared when nothing remains.
 *
 * <p>The method is {@code synchronized}; presumably because {@code wktReader} and
 * {@code wkbReader} are shared instances (verify against their declarations).
 *
 * @param text      the text to consume from; modified in place
 * @param separator the field separator expected after the geometry
 * @return the parsed geometry, or {@code null} if no geometry was recognized
 * @throws RuntimeException if a quoted string has no terminating quote, or if the reader
 *                          reports a {@code ParseException}
 */
public static synchronized Geometry consumeGeometryJTS(Text text, char separator) {
    byte[] bytes = text.getBytes();
    int length = text.getLength();
    Geometry geom;
    int i1, i2; // Start (inclusive) and end (exclusive) offset of the geometry being parsed
    int i_next; // Beginning of the next field
    boolean isWKT = false;
    boolean isHex = false;
    // Guard on length: for an empty text the backing array may be empty or hold stale bytes.
    if (length > 0 && (bytes[0] == '\'' || bytes[0] == '\"')) {
        // A quoted string. Find the terminating quote; the quotes themselves are excluded.
        i1 = 1;
        i2 = 2;
        while (i2 < length && bytes[i2] != bytes[0]) {
            i2++;
        }
        // '>=' rather than '==' so that a text consisting of a lone quote is rejected too.
        if (i2 >= length) {
            throw new RuntimeException("Unterminated quoted string");
        }
        i_next = i2 + 1;
        isWKT = true; // Assume any quoted string to be WKT
    } else {
        // Not a quoted string, check whether the text starts with a known shape name.
        boolean startsWithShapeName = false;
        for (int i_shape = 0; !startsWithShapeName && i_shape < ShapeNames.length; i_shape++) {
            byte[] shapeName = ShapeNames[i_shape];
            if (length > shapeName.length) {
                int i = 0;
                while (i < shapeName.length && shapeName[i] == bytes[i]) {
                    i++;
                }
                startsWithShapeName = (i == shapeName.length);
            }
        }

        if (startsWithShapeName) {
            isWKT = true;
            // Look for the terminator of the shape text
            i1 = 0;
            i2 = 1;
            // Search for the first open parenthesis
            while (i2 < length && bytes[i2] != '(') {
                i2++;
            }
            if (i2 < length) {
                i2++; // Skip the open parenthesis itself
            }
            int nesting = 1;
            while (i2 < length && nesting > 0) {
                if (bytes[i2] == '(') {
                    nesting++;
                } else if (bytes[i2] == ')') {
                    nesting--;
                }
                i2++;
            }
            // NOTE(review): this steps over the byte after the WKT unconditionally and the
            // separator check below may skip a second one, so an empty field directly after a
            // WKT geometry is dropped. Left unchanged; confirm whether that is intended.
            i_next = i2 + 1;
        } else {
            // Check if the type is hex-encoded WKB
            i1 = 0;
            i2 = 0;
            // Java bytes are signed: a negative (non-ASCII) byte must not be used as an index.
            while (i2 < length && bytes[i2] >= 0 && IsHex[bytes[i2]]) {
                i2++;
            }
            isHex = i2 > 1;
            i_next = i2;
        }
    }

    // Hadoop Text holds UTF-8, so decode explicitly instead of using the platform charset.
    String geom_text = new String(bytes, i1, i2 - i1, java.nio.charset.StandardCharsets.UTF_8);

    try {
        if (isWKT) {
            geom = wktReader.read(geom_text);
        } else if (isHex) {
            byte[] binary = hexToBytes(geom_text);
            geom = wkbReader.read(binary);
        } else {
            geom = null;
        }
    } catch (ParseException e) {
        throw new RuntimeException(String.format("Error parsing '%s'", geom_text), e);
    }

    // Remove consumed bytes from the text
    if (i_next >= text.getLength()) {
        text.clear();
    } else {
        if (bytes[i_next] == separator) {
            i_next++;
        }
        text.set(bytes, i_next, length - i_next);
    }

    return geom;
}

From source file:edu.umn.cs.spatialHadoop.nasa.NASAPoint.java

License:Open Source License

/**
 * Restores this point from its text form: the superclass fields first, then {@code value} and
 * {@code timestamp}.
 *
 * @param text the text to parse; it is consumed as the fields are read
 */
@Override
public void fromText(Text text) {
    super.fromText(text);

    // Drop the first remaining byte (presumably the separator left behind by the superclass
    // parser; verify against super.fromText).
    text.set(text.getBytes(), 1, text.getLength() - 1);

    this.value = TextSerializerHelper.consumeInt(text, ',');
    this.timestamp = TextSerializerHelper.consumeLong(text, '\0');
}

From source file:fm.last.darling.hbase.HBaseJSONOutputReader.java

License:Apache License

/**
 * Splits one output line into key and value, then builds {@code rowkey} and {@code put} from
 * them.
 *
 * <p>The key is everything before the {@code numKeyFields}-th occurrence of {@code separator};
 * if there are fewer occurrences, the whole line is the key and the value is empty. The value
 * has its first and last character stripped and is parsed as JSON into a map from
 * {@code "family:qualifier"} to a dictionary. For each entry whose dictionary has a non-null
 * {@code "value"}, one cell is added to {@code put}.
 *
 * @param line   the raw line bytes
 * @param length the number of valid bytes in {@code line}
 * @throws IOException if the line cannot be decoded, the value is too short to strip its
 *                     enclosing characters, or the JSON cannot be parsed
 */
private void interpretKeyandValue(byte[] line, int length) throws IOException {
    // Need to find numKeyFields separators
    int pos = UTF8ByteArrayUtils.findBytes(line, 0, length, separator);
    for (int field = 1; field < numKeyFields && pos != -1; field++) {
        pos = UTF8ByteArrayUtils.findBytes(line, pos + separator.length, length, separator);
    }

    Text k = new Text();
    Text v = new Text();
    try {
        if (pos == -1) {
            k.set(line, 0, length);
            v.set("");
        } else {
            StreamKeyValUtil.splitKeyVal(line, 0, length, k, v, pos, separator.length);
        }
    } catch (CharacterCodingException e) {
        throw new IOException(e);
    }

    // removing a ' at the start and end of the key
    byte[] keyBytes = trimOuterBytes(k);

    rowkey = new ImmutableBytesWritable(keyBytes);
    put = new Put(keyBytes);

    String tmpV = v.toString();
    // The first and last character are stripped below; a shorter value (e.g. the empty value
    // produced when no separator was found) would otherwise fail with an unchecked
    // StringIndexOutOfBoundsException.
    if (tmpV.length() < 2) {
        throw new IOException("error, value too short to strip its enclosing characters: '" + tmpV + "'");
    }
    String json = tmpV.substring(1, tmpV.length() - 1);
    Map<String, Map> payload;
    try {
        payload = (Map<String, Map>) ObjectBuilder.fromJSON(json); // the 'erased' type?
    } catch (Exception e) {
        throw new IOException("error, fromJson: ", e);
    }

    Set<Map.Entry<String, Map>> entries = payload.entrySet();
    for (Map.Entry<String, Map> entry : entries) {
        String cfq = entry.getKey(); // let's consider not joining family and qualifier at emitter.
        String[] parts = cfq.split(":");
        if (parts.length < 2) {
            continue; // not of the form family:qualifier
        }
        String family = parts[0];
        String qualifier = parts[1];

        Map dict = entry.getValue(); // unchecked.

        // expecting dict to carry 'value',
        Object value = dict.get("value");
        if (value == null) {
            continue; // no good.
        }

        // A 'timestamp' entry may also be present in dict; it is currently not used.

        put.add(family.getBytes("UTF-8"), qualifier.getBytes("UTF-8"), value.toString().getBytes("UTF-8"));
    }
}

From source file:hivemall.sketch.bloom.BloomFilterUtils.java

License:Apache License

/**
 * Writes {@code filter} through a {@code DataOutputStream} wrapped around a
 * {@code Base91OutputStream} into an in-memory buffer and stores the buffered bytes in
 * {@code dst}.
 *
 * @param filter the filter to serialize
 * @param dst    the text that receives the serialized bytes
 * @return {@code dst}
 * @throws IOException if writing the filter fails
 */
@Nonnull
public static Text serialize(@Nonnull final Filter filter, @Nonnull final Text dst) throws IOException {
    final FastByteArrayOutputStream buffer = new FastByteArrayOutputStream();
    final Base91OutputStream encoder = new Base91OutputStream(buffer);
    final DataOutputStream sink = new DataOutputStream(encoder);

    filter.write(sink);
    sink.flush();
    // Let the encoder emit whatever it still holds before the buffer is read.
    encoder.finish();

    final byte[] encoded = buffer.getInternalArray();
    final int encodedLength = buffer.size();
    // The internal array is passed with an explicit length, so only the first
    // 'encodedLength' bytes are copied into dst.
    dst.set(encoded, 0, encodedLength);
    return dst;
}

From source file:hivemall.utils.hadoop.JsonSerdeUtils.java

License:Apache License

/**
 * Appends the JSON form of a primitive value to {@code sb}.
 *
 * <p>A {@code null} object is written as the literal {@code null}. Boolean, numeric and decimal
 * values are appended as they are; string, varchar, char and binary values are escaped with
 * {@code SerDeUtils.escapeString} and passed to {@code appendWithQuotes}; date and timestamp
 * values are passed to {@code appendWithQuotes} via their {@code toString()} form.
 *
 * @param sb  the builder to append to
 * @param obj the value to serialize, may be {@code null}
 * @param poi the inspector describing {@code obj}; it is dereferenced whenever {@code obj} is
 *            not {@code null}
 * @throws SerDeException if the primitive category is not handled
 */
private static void serializePrimitive(@Nonnull final StringBuilder sb, @Nullable final Object obj,
        @Nullable final PrimitiveObjectInspector poi) throws SerDeException {
    if (obj == null) {
        sb.append("null");
        return;
    }

    switch (poi.getPrimitiveCategory()) {
    case BOOLEAN: {
        final BooleanObjectInspector inspector = (BooleanObjectInspector) poi;
        if (inspector.get(obj)) {
            sb.append("true");
        } else {
            sb.append("false");
        }
        break;
    }
    case BYTE: {
        final ByteObjectInspector inspector = (ByteObjectInspector) poi;
        sb.append(inspector.get(obj));
        break;
    }
    case SHORT: {
        final ShortObjectInspector inspector = (ShortObjectInspector) poi;
        sb.append(inspector.get(obj));
        break;
    }
    case INT: {
        final IntObjectInspector inspector = (IntObjectInspector) poi;
        sb.append(inspector.get(obj));
        break;
    }
    case LONG: {
        final LongObjectInspector inspector = (LongObjectInspector) poi;
        sb.append(inspector.get(obj));
        break;
    }
    case FLOAT: {
        final FloatObjectInspector inspector = (FloatObjectInspector) poi;
        sb.append(inspector.get(obj));
        break;
    }
    case DOUBLE: {
        final DoubleObjectInspector inspector = (DoubleObjectInspector) poi;
        sb.append(inspector.get(obj));
        break;
    }
    case STRING: {
        final StringObjectInspector inspector = (StringObjectInspector) poi;
        final String escaped = SerDeUtils.escapeString(inspector.getPrimitiveJavaObject(obj));
        appendWithQuotes(sb, escaped);
        break;
    }
    case BINARY: {
        // The raw bytes are wrapped in a Text and rendered through its toString().
        final BinaryObjectInspector inspector = (BinaryObjectInspector) poi;
        final byte[] raw = inspector.getPrimitiveJavaObject(obj);
        final Text holder = new Text();
        holder.set(raw, 0, raw.length);
        appendWithQuotes(sb, SerDeUtils.escapeString(holder.toString()));
        break;
    }
    case DATE: {
        final DateObjectInspector inspector = (DateObjectInspector) poi;
        final Date date = inspector.getPrimitiveJavaObject(obj);
        appendWithQuotes(sb, date.toString());
        break;
    }
    case TIMESTAMP: {
        final TimestampObjectInspector inspector = (TimestampObjectInspector) poi;
        final Timestamp timestamp = inspector.getPrimitiveJavaObject(obj);
        appendWithQuotes(sb, timestamp.toString());
        break;
    }
    case DECIMAL: {
        final HiveDecimalObjectInspector inspector = (HiveDecimalObjectInspector) poi;
        sb.append(inspector.getPrimitiveJavaObject(obj));
        break;
    }
    case VARCHAR: {
        final HiveVarcharObjectInspector inspector = (HiveVarcharObjectInspector) poi;
        final String escaped = SerDeUtils.escapeString(inspector.getPrimitiveJavaObject(obj).toString());
        appendWithQuotes(sb, escaped);
        break;
    }
    case CHAR: {
        // This should use HiveChar.getPaddedValue(), but it is protected; currently (v0.13)
        // HiveChar.toString() returns getPaddedValue().
        final HiveCharObjectInspector inspector = (HiveCharObjectInspector) poi;
        final String escaped = SerDeUtils.escapeString(inspector.getPrimitiveJavaObject(obj).toString());
        appendWithQuotes(sb, escaped);
        break;
    }
    default:
        throw new SerDeException("Unknown primitive type: " + poi.getPrimitiveCategory());
    }
}

From source file:io.fluo.core.util.ByteUtil.java

License:Apache License

/**
 * Converts a {@code Bytes} value into a Hadoop {@code Text} object.
 *
 * <p>When {@code b} is backed by an array, the range given by its offset and length is passed
 * to {@code Text.set}; otherwise the result of {@code b.toArray()} is wrapped.
 *
 * @param b Bytes
 * @return Text object
 */
public static Text toText(Bytes b) {
    if (!b.isBackedByArray()) {
        return new Text(b.toArray());
    }

    final Text result = new Text(EMPTY);
    result.set(b.getBackingArray(), b.offset(), b.length());
    return result;
}

From source file:mapred.io.CustomRecordReader.java

License:Apache License

/**
 * Reads the first line from {@code in}, advances {@code pos} by the number of bytes read, and
 * strips a leading UTF-8 byte order mark (0xEF, 0xBB, 0xBF) from the line if one is present.
 *
 * <p>Text only supports UTF-8, so only the UTF-8 BOM needs to be checked. Up to three bytes
 * more than {@code maxLineLength} are requested so that a BOM does not shorten the usable
 * line; if the size returned by {@code readLine} is not less than {@code maxLineLength}, the
 * caller is expected to discard the line and read the next one.
 *
 * @return the number of bytes read, reduced by three when a BOM was stripped
 * @throws IOException if reading the line fails
 */
private int skipUtfByteOrderMark() throws IOException {
    // NOTE(review): the line is read into a method-local Text, so its content is not visible
    // to the caller once this method returns; confirm that this is intended.
    final Text line = new Text();
    final int lineLimit = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE);
    final int bytesRead = in.readLine(line, lineLimit, maxBytesToConsume(pos));
    pos += bytesRead;

    final int lineLength = line.getLength();
    final byte[] raw = line.getBytes();
    final boolean startsWithBom = lineLength >= 3
            && raw[0] == (byte) 0xEF
            && raw[1] == (byte) 0xBB
            && raw[2] == (byte) 0xBF;
    if (!startsWithBom) {
        return bytesRead;
    }

    LOG.info("Found UTF-8 BOM and skipped it");
    final int payloadLength = lineLength - 3;
    if (payloadLength > 0) {
        // It may work to reuse the same buffer instead of copying the bytes first.
        line.set(line.copyBytes(), 3, payloadLength);
    } else {
        line.clear();
    }
    return bytesRead - 3;
}

From source file:mr.MyFileRecordReader2.java

License:Apache License

/**
 * Reads the first line from {@code in} into {@code value}, advances {@code pos} by the number
 * of bytes read, and removes a leading UTF-8 byte order mark (0xEF, 0xBB, 0xBF) from
 * {@code value} if one is present.
 *
 * <p>Text only supports UTF-8, so only the UTF-8 BOM is checked. The read limit is
 * {@code maxLineLength} plus three bytes so that a BOM does not reduce the usable line length;
 * if the size returned by {@code readLine} is not less than {@code maxLineLength}, the caller
 * is expected to discard the line and read the next one.
 *
 * @param value receives the line, without the BOM
 * @return the number of bytes read, reduced by three when a BOM was removed
 * @throws IOException if reading the line fails
 */
private int skipUtfByteOrderMark(Text value) throws IOException {
    final long extendedLimit = 3L + (long) maxLineLength;
    final int readLimit = (int) Math.min(extendedLimit, Integer.MAX_VALUE);
    final int rawSize = in.readLine(value, readLimit, maxBytesToConsume(pos));
    pos += rawSize;

    final int fullLength = value.getLength();
    final byte[] head = value.getBytes();
    // No BOM: too short, or one of the three marker bytes differs.
    if (fullLength < 3 || head[0] != (byte) 0xEF || head[1] != (byte) 0xBB || head[2] != (byte) 0xBF) {
        return rawSize;
    }

    LOG.info("Found UTF-8 BOM and skipped it");
    final int withoutBom = fullLength - 3;
    if (withoutBom <= 0) {
        value.clear();
    } else {
        // It may work to reuse the same buffer instead of copying the bytes first.
        final byte[] snapshot = value.copyBytes();
        value.set(snapshot, 3, withoutBom);
    }
    return rawSize - 3;
}