Example usage for org.apache.hadoop.io Text set

Introduction

On this page you can find example usages of the org.apache.hadoop.io.Text set method.

Prototype

public void set(byte[] utf8, int start, int len)

Source Link

Document

Sets the Text to a range of bytes.

Usage

From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java

License:Open Source License

/**
 * Parses a serialized map from the beginning of {@code text} into {@code tags}.
 *
 * <p>{@code tags} is always cleared first. If {@code text} is empty, or its first byte is
 * not the {@code MapStart} separator, nothing else happens and {@code text} is left
 * untouched. Otherwise entries are read as {@code key <KeyValueSeparator> value}, separated
 * by {@code FieldSeparator}, until the {@code MapEnd} separator or the end of the text.
 *
 * <p>After parsing, {@code text} is reset to the bytes starting at the position where
 * parsing stopped. The {@code MapEnd} byte itself, if one was found, is not removed. If
 * parsing ran to the end of the text, {@code text} is not modified.
 *
 * <p>Keys and values are decoded as UTF-8, the encoding used by Hadoop {@code Text},
 * instead of the platform default charset.
 *
 * @param text the text to parse; its consumed prefix is removed
 * @param tags the map that receives the parsed entries; cleared before parsing
 */
public static void consumeMap(Text text, Map<String, String> tags) {
    tags.clear();
    final int length = text.getLength();
    if (length > 0) {
        // Backing array of the Text; only the first `length` bytes are valid
        byte[] tagsBytes = text.getBytes();
        if (tagsBytes[0] != Separators[MapStart])
            return;
        int i1 = 1;
        while (i1 < length && tagsBytes[i1] != Separators[MapEnd]) {
            // Scan the key: everything up to the key-value separator
            int i2 = i1 + 1;
            while (i2 < length && tagsBytes[i2] != Separators[KeyValueSeparator])
                i2++;
            String key = new String(tagsBytes, i1, i2 - i1, java.nio.charset.StandardCharsets.UTF_8);
            i1 = i2 + 1;

            // Scan the value: everything up to the next field separator or the map end
            i2 = i1 + 1;
            while (i2 < length && tagsBytes[i2] != Separators[FieldSeparator]
                    && tagsBytes[i2] != Separators[MapEnd])
                i2++;
            String value = new String(tagsBytes, i1, i2 - i1, java.nio.charset.StandardCharsets.UTF_8);
            tags.put(key, value);
            i1 = i2;
            // Skip the separator between two consecutive entries
            if (i1 < length && tagsBytes[i1] == Separators[FieldSeparator])
                i1++;
        }
        // Keep only the bytes that were not consumed
        if (i1 < length)
            text.set(tagsBytes, i1, length - i1);
    }
}

From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java

License:Open Source License

/**
 * Consumes one geometry from the beginning of {@code text} and parses it with
 * {@code parseText}.
 *
 * <p>If the text starts with a single or double quote, that quote character replaces
 * {@code separator} as the terminator and the opening quote is skipped. The geometry
 * string is everything up to, but excluding, the first terminator or the end of the text.
 *
 * <p>The consumed bytes are removed from {@code text}. The terminator itself (the separator
 * or the closing quote) is left as the first byte of the remaining text.
 *
 * <p>The quote check only looks at bytes inside the valid length of the text, so an empty
 * {@code Text} no longer reads past its contents. The geometry string is decoded as UTF-8,
 * the encoding used by Hadoop {@code Text}.
 *
 * @param text the text to consume from; its consumed prefix is removed
 * @param separator the byte that terminates an unquoted geometry
 * @return whatever {@code parseText} returns for the extracted string
 */
public static OGCGeometry consumeGeometryESRI(Text text, char separator) {
    byte[] bytes = text.getBytes();
    int length = text.getLength();

    // Look for the terminator of the shape text
    int i1 = 0;
    if (length > 0 && (bytes[i1] == '\'' || bytes[i1] == '\"')) {
        // Quoted value: the matching quote terminates it instead of the separator
        separator = (char) bytes[i1++];
    }
    int i2 = i1;
    while (i2 < length && bytes[i2] != separator)
        i2++;

    String str = new String(bytes, i1, i2 - i1, java.nio.charset.StandardCharsets.UTF_8);

    // Remove consumed bytes from the text (the terminator at i2 stays)
    text.set(bytes, i2, length - i2);

    return parseText(str);
}

From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java

License:Open Source License

/**
 * Consumes one geometry from the beginning of {@code text} and parses it into a JTS
 * {@code Geometry}.
 *
 * <p>Three input forms are recognized, checked in this order:
 * <ol>
 *   <li>A string enclosed in single or double quotes; its content is parsed as WKT.</li>
 *   <li>Unquoted text that starts with one of {@code ShapeNames}; it is parsed as WKT and
 *       extends to the parenthesis that balances the first open parenthesis.</li>
 *   <li>A run of at least two bytes for which {@code IsHex} is true; it is converted with
 *       {@code hexToBytes} and parsed by {@code wkbReader}.</li>
 * </ol>
 * If none of these forms matches, {@code null} is returned.
 *
 * <p>The consumed bytes are removed from {@code text}. If the first unconsumed byte equals
 * {@code separator}, it is removed as well.
 *
 * <p>Only bytes inside the valid length of the text are inspected: an empty text returns
 * {@code null}, a lone quote is reported as unterminated, and non-ASCII (negative) bytes
 * end the hex scan instead of being used as an array index. The geometry string is decoded
 * as UTF-8, the encoding used by Hadoop {@code Text}.
 *
 * <p>NOTE(review): the method is synchronized, presumably because the shared
 * {@code wktReader}/{@code wkbReader} instances are not thread-safe; confirm.
 *
 * @param text the text to consume from; its consumed prefix is removed
 * @param separator the field separator that may follow the geometry
 * @return the parsed geometry, or {@code null} if no geometry form was recognized
 * @throws RuntimeException if a quoted string is not terminated or the geometry text
 *         cannot be parsed
 */
public static synchronized Geometry consumeGeometryJTS(Text text, char separator) {
    byte[] bytes = text.getBytes();
    int length = text.getLength();
    Geometry geom;
    // i1 is the start offset of the geometry text; i2 ends up holding the LENGTH passed to
    // the String constructor (identical to the end offset whenever i1 == 0)
    int i1, i2;
    int i_next; // Beginning of the next field
    boolean isWKT = false;
    boolean isHex = false;
    if (length > 0 && (bytes[0] == '\'' || bytes[0] == '\"')) {
        // A quoted string. Find terminating quote and trim the quotes
        i1 = 1;
        i2 = 2;
        while (i2 < length && bytes[i2] != bytes[0])
            i2++;
        // '>=' also rejects a text that consists of the opening quote only
        if (i2 >= length)
            throw new RuntimeException("Unterminated quoted string");
        i_next = i2 + 1;
        i2--; // With i1 == 1 this turns the closing-quote offset into the content length
        isWKT = true; // Assume any quoted string to be WKT
    } else {
        // Not a quoted string, check if the text starts with a known shape name
        for (int i_shape = 0; !isWKT && i_shape < ShapeNames.length; i_shape++) {
            byte[] shapeName = ShapeNames[i_shape];
            if (length > shapeName.length) {
                int i = 0;
                while (i < shapeName.length && shapeName[i] == bytes[i])
                    i++;
                isWKT = i == shapeName.length;
            }
        }

        if (isWKT) {
            // Look for the terminator of the shape text
            i1 = 0;
            i2 = 1;
            // Search for the first open parenthesis
            while (i2 < length && bytes[i2] != '(')
                i2++;
            if (i2 < length)
                i2++; // Skip the open parenthesis itself
            // Advance until the parenthesis opened above is balanced
            int nesting = 1;
            while (i2 < length && nesting > 0) {
                if (bytes[i2] == '(')
                    nesting++;
                else if (bytes[i2] == ')')
                    nesting--;
                i2++;
            }
            // The byte right after the closing parenthesis is skipped unconditionally
            i_next = i2 + 1;
        } else {
            // Check if the type is hex-encoded WKB
            i1 = 0;
            i2 = 0;
            // A negative byte (>= 0x80) can never be a hex digit and must not index IsHex
            while (i2 < length && bytes[i2] >= 0 && IsHex[bytes[i2]])
                i2++;
            isHex = i2 > 1;
            i_next = i2;
        }
    }

    String geom_text = new String(bytes, i1, i2, java.nio.charset.StandardCharsets.UTF_8);

    try {
        if (isWKT) {
            geom = wktReader.read(geom_text);
        } else if (isHex) {
            byte[] binary = hexToBytes(geom_text);
            geom = wkbReader.read(binary);
        } else {
            geom = null;
        }
    } catch (ParseException e) {
        throw new RuntimeException(String.format("Error parsing '%s'", geom_text), e);
    }

    // Remove consumed bytes from the text
    if (i_next >= text.getLength())
        text.clear();
    else {
        if (bytes[i_next] == separator)
            i_next++;
        text.set(bytes, i_next, length - i_next);
    }

    return geom;
}

From source file:edu.umn.cs.spatialHadoop.nasa.NASAPoint.java

License:Open Source License

/**
 * Parses this point from {@code text}: the superclass consumes its own fields first, then
 * one byte is dropped and the {@code value} and {@code timestamp} fields are read.
 *
 * @param text the text to parse; it is consumed while parsing
 */
@Override
public void fromText(Text text) {
    super.fromText(text);

    // Drop the first remaining byte
    // NOTE(review): presumably the separator left behind by the superclass; confirm.
    final byte[] remainingBytes = text.getBytes();
    final int remainingLength = text.getLength() - 1;
    text.set(remainingBytes, 1, remainingLength);

    this.value = TextSerializerHelper.consumeInt(text, ',');
    this.timestamp = TextSerializerHelper.consumeLong(text, '\0');
}

From source file:fm.last.darling.hbase.HBaseJSONOutputReader.java

License:Apache License

/**
 * Splits one output line into key and value, then builds the row key and the {@code Put}
 * from them.
 *
 * <p>The key spans the first {@code numKeyFields} separator-delimited fields. If the line
 * has fewer separators than that, the whole line is the key and the value is empty. The
 * key is passed through {@code trimOuterBytes}. The value has its first and last character
 * stripped and is then parsed as JSON into a map from {@code "family:qualifier"} to a
 * dictionary. For each entry whose dictionary carries a {@code "value"}, one cell is added
 * to {@code put}. Entries without a colon in their name or without a {@code "value"} are
 * skipped.
 *
 * <p>A value shorter than two characters (including the empty value produced when no
 * separator is found) cannot have its enclosing characters stripped. It is reported as an
 * {@link IOException} rather than failing with a {@code StringIndexOutOfBoundsException}.
 *
 * @param line the raw bytes of the line
 * @param length the number of valid bytes in {@code line}
 * @throws IOException if the line cannot be decoded, the value is too short, or the JSON
 *         payload cannot be parsed
 */
private void interpretKeyandValue(byte[] line, int length) throws IOException {
    // Need to find numKeyFields separators
    int pos = UTF8ByteArrayUtils.findBytes(line, 0, length, separator);
    for (int k = 1; k < numKeyFields && pos != -1; k++) {
        pos = UTF8ByteArrayUtils.findBytes(line, pos + separator.length, length, separator);
    }

    Text k = new Text();
    Text v = new Text();
    try {
        if (pos == -1) {
            // Not enough separators: the whole line is the key
            k.set(line, 0, length);
            v.set("");
        } else {
            StreamKeyValUtil.splitKeyVal(line, 0, length, k, v, pos, separator.length);
        }
    } catch (CharacterCodingException e) {
        throw new IOException(e);
    }

    // removing a ' at the start and end of the key
    byte[] keyBytes = trimOuterBytes(k);

    rowkey = new ImmutableBytesWritable(keyBytes);
    put = new Put(keyBytes);

    String tmpV = v.toString();
    // Stripping the first and last character needs at least two characters
    if (tmpV.length() < 2) {
        throw new IOException("error, value too short to strip its enclosing characters: '" + tmpV + "'");
    }
    String json = tmpV.substring(1, tmpV.length() - 1);
    Map<String, Map> payload;
    try {
        payload = (Map<String, Map>) ObjectBuilder.fromJSON(json); // the 'erased' type?
    } catch (Exception e) {
        throw new IOException("error, fromJson: ", e);
    }

    Set<Map.Entry<String, Map>> entries = payload.entrySet();
    for (Map.Entry<String, Map> entry : entries) {
        String cfq = entry.getKey(); // let's consider not joining family and qualifier at emitter.
        String[] parts = cfq.split(":");
        if (parts.length < 2)
            continue;
        String family = parts[0];
        String qualifier = parts[1];

        Map dict = entry.getValue(); // unchecked.

        // expecting dict to carry 'value',
        Object value = dict.get("value");
        if (value == null)
            continue; // no good.

        // A 'timestamp' entry may also be present in dict; it is currently ignored.

        put.add(family.getBytes("UTF-8"), qualifier.getBytes("UTF-8"), value.toString().getBytes("UTF-8"));
    }
}

From source file:hivemall.sketch.bloom.BloomFilterUtils.java

License:Apache License

/**
 * Writes {@code filter} through a {@code Base91OutputStream} into an in-memory buffer and
 * stores the resulting bytes in {@code dst}.
 *
 * @param filter the filter to serialize
 * @param dst the text that receives the encoded bytes
 * @return {@code dst}
 * @throws IOException if writing the filter fails
 */
@Nonnull
public static Text serialize(@Nonnull final Filter filter, @Nonnull final Text dst) throws IOException {
    final FastByteArrayOutputStream buffer = new FastByteArrayOutputStream();
    final Base91OutputStream encoder = new Base91OutputStream(buffer);
    final DataOutputStream dataOut = new DataOutputStream(encoder);

    filter.write(dataOut);
    dataOut.flush();
    // Finish the encoder before the buffer contents are read
    encoder.finish();

    final byte[] encoded = buffer.getInternalArray();
    final int encodedLength = buffer.size();
    dst.set(encoded, 0, encodedLength);
    return dst;
}

From source file:hivemall.utils.hadoop.JsonSerdeUtils.java

License:Apache License

/**
 * Appends the JSON representation of a primitive value to {@code sb}.
 *
 * <p>A {@code null} object is written as the literal {@code null}. Boolean, numeric and
 * decimal values are appended unquoted. String, varchar, char and binary values are
 * escaped with {@code SerDeUtils.escapeString} and quoted. Date and timestamp values are
 * quoted using their {@code toString()} form.
 *
 * @param sb the builder to append to
 * @param obj the value to serialize, may be {@code null}
 * @param poi the inspector describing {@code obj}; dereferenced whenever {@code obj} is
 *        not {@code null}
 * @throws SerDeException if the primitive category is not handled
 */
private static void serializePrimitive(@Nonnull final StringBuilder sb, @Nullable final Object obj,
        @Nullable final PrimitiveObjectInspector poi) throws SerDeException {
    if (obj == null) {
        sb.append("null");
        return;
    }

    switch (poi.getPrimitiveCategory()) {
    case BOOLEAN: {
        final BooleanObjectInspector booleanOI = (BooleanObjectInspector) poi;
        if (booleanOI.get(obj)) {
            sb.append("true");
        } else {
            sb.append("false");
        }
        break;
    }
    case BYTE: {
        final ByteObjectInspector byteOI = (ByteObjectInspector) poi;
        sb.append(byteOI.get(obj));
        break;
    }
    case SHORT: {
        final ShortObjectInspector shortOI = (ShortObjectInspector) poi;
        sb.append(shortOI.get(obj));
        break;
    }
    case INT: {
        final IntObjectInspector intOI = (IntObjectInspector) poi;
        sb.append(intOI.get(obj));
        break;
    }
    case LONG: {
        final LongObjectInspector longOI = (LongObjectInspector) poi;
        sb.append(longOI.get(obj));
        break;
    }
    case FLOAT: {
        final FloatObjectInspector floatOI = (FloatObjectInspector) poi;
        sb.append(floatOI.get(obj));
        break;
    }
    case DOUBLE: {
        final DoubleObjectInspector doubleOI = (DoubleObjectInspector) poi;
        sb.append(doubleOI.get(obj));
        break;
    }
    case STRING: {
        final StringObjectInspector stringOI = (StringObjectInspector) poi;
        final String escaped = SerDeUtils.escapeString(stringOI.getPrimitiveJavaObject(obj));
        appendWithQuotes(sb, escaped);
        break;
    }
    case BINARY: {
        // The raw bytes are turned into a string by way of a Text
        final byte[] raw = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(obj);
        final Text holder = new Text();
        holder.set(raw, 0, raw.length);
        final String escaped = SerDeUtils.escapeString(holder.toString());
        appendWithQuotes(sb, escaped);
        break;
    }
    case DATE: {
        final Date date = ((DateObjectInspector) poi).getPrimitiveJavaObject(obj);
        appendWithQuotes(sb, date.toString());
        break;
    }
    case TIMESTAMP: {
        final Timestamp timestamp = ((TimestampObjectInspector) poi).getPrimitiveJavaObject(obj);
        appendWithQuotes(sb, timestamp.toString());
        break;
    }
    case DECIMAL: {
        final HiveDecimalObjectInspector decimalOI = (HiveDecimalObjectInspector) poi;
        sb.append(decimalOI.getPrimitiveJavaObject(obj));
        break;
    }
    case VARCHAR: {
        final HiveVarcharObjectInspector varcharOI = (HiveVarcharObjectInspector) poi;
        final String escaped = SerDeUtils.escapeString(varcharOI.getPrimitiveJavaObject(obj).toString());
        appendWithQuotes(sb, escaped);
        break;
    }
    case CHAR: {
        // this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13)
        // HiveChar.toString() returns getPaddedValue()
        final HiveCharObjectInspector charOI = (HiveCharObjectInspector) poi;
        final String escaped = SerDeUtils.escapeString(charOI.getPrimitiveJavaObject(obj).toString());
        appendWithQuotes(sb, escaped);
        break;
    }
    default:
        throw new SerDeException("Unknown primitive type: " + poi.getPrimitiveCategory());
    }
}

From source file:io.fluo.core.util.ByteUtil.java

License:Apache License

/**
 * Converts a {@code Bytes} value into a Hadoop {@code Text} object.
 *
 * <p>When {@code b} is backed by an array, the returned text is filled from that array
 * using the offset and length reported by {@code b}. Otherwise it is built from
 * {@code b.toArray()}.
 *
 * @param b Bytes
 * @return Text object
 */
public static Text toText(Bytes b) {
    if (!b.isBackedByArray()) {
        return new Text(b.toArray());
    }

    final Text result = new Text(EMPTY);
    result.set(b.getBackingArray(), b.offset(), b.length());
    return result;
}

From source file:mapred.io.CustomRecordReader.java

License:Apache License

/**
 * Reads the first line of the stream and strips a leading UTF-8 byte order mark
 * (0xEF, 0xBB, 0xBF) if one is present.
 *
 * <p>{@code pos} is advanced by the number of bytes read, including the BOM. The line is
 * read into a method-local {@code Text}, so only the byte count leaves this method.
 *
 * @return the number of bytes read for the first line, not counting a stripped BOM
 * @throws IOException if reading the line fails
 */
private int skipUtfByteOrderMark() throws IOException {
    final Text firstLine = new Text();
    // Text only supports UTF-8, so only the UTF-8 BOM needs to be checked. Three extra
    // bytes are allowed on the first line to leave room for it.
    final int lineLimit = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE);
    final int bytesRead = in.readLine(firstLine, lineLimit, maxBytesToConsume(pos));
    // Reading 3 extra bytes does not alter existing behavior: the number of bytes copied
    // to the Text never exceeds the returned size, and a line whose returned size is not
    // less than maxLineLength is discarded by the caller anyway.
    pos += bytesRead;

    final int lineLength = firstLine.getLength();
    final byte[] lineBytes = firstLine.getBytes();
    final boolean startsWithBom = (lineLength >= 3) && (lineBytes[0] == (byte) 0xEF)
            && (lineBytes[1] == (byte) 0xBB) && (lineBytes[2] == (byte) 0xBF);
    if (!startsWithBom) {
        return bytesRead;
    }

    LOG.info("Found UTF-8 BOM and skipped it");
    final int payloadLength = lineLength - 3;
    if (payloadLength > 0) {
        // Copy first so the source of set() is not the Text's own backing array
        final byte[] copy = firstLine.copyBytes();
        firstLine.set(copy, 3, payloadLength);
    } else {
        firstLine.clear();
    }
    return bytesRead - 3;
}

From source file:mr.MyFileRecordReader2.java

License:Apache License

/**
 * Reads the first line of the stream into {@code value} and removes a leading UTF-8 byte
 * order mark (0xEF, 0xBB, 0xBF) from it if one is present.
 *
 * <p>{@code pos} is advanced by the number of bytes read, including the BOM.
 *
 * @param value receives the first line without the BOM
 * @return the number of bytes read for the first line, not counting a stripped BOM
 * @throws IOException if reading the line fails
 */
private int skipUtfByteOrderMark(Text value) throws IOException {
    // Text only supports UTF-8, so the UTF-8 BOM is the only one to look for. The line
    // limit is widened by 3 bytes to leave room for it.
    final long widenedLimit = 3L + (long) maxLineLength;
    final int lineLimit = (int) Math.min(widenedLimit, Integer.MAX_VALUE);
    int consumed = in.readLine(value, lineLimit, maxBytesToConsume(pos));
    // The 3 extra bytes do not change existing behavior: no more than `consumed` bytes are
    // copied to the Text, and a line whose size is not less than maxLineLength is
    // discarded by the caller, which then reads the next line.
    pos += consumed;

    int remaining = value.getLength();
    byte[] content = value.getBytes();
    boolean bomFound = false;
    if (remaining >= 3) {
        bomFound = (content[0] == (byte) 0xEF) && (content[1] == (byte) 0xBB)
                && (content[2] == (byte) 0xBF);
    }

    if (bomFound) {
        LOG.info("Found UTF-8 BOM and skipped it");
        remaining -= 3;
        consumed -= 3;
        if (remaining <= 0) {
            value.clear();
        } else {
            // Work on a copy so set() does not read from the Text's own backing array
            content = value.copyBytes();
            value.set(content, 3, remaining);
        }
    }
    return consumed;
}