List of usage examples for the set method of org.apache.hadoop.io.Text
public void set(byte[] utf8, int start, int len)
From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java
License:Open Source License
/**
 * Parses a serialized map from the beginning of {@code text} into {@code tags}.
 *
 * <p>{@code tags} is always cleared first. If {@code text} is empty or does not start with
 * {@code Separators[MapStart]}, nothing else happens. Otherwise entries of the form
 * key, {@code Separators[KeyValueSeparator]}, value are read, delimited by
 * {@code Separators[FieldSeparator]}, until {@code Separators[MapEnd]} or the end of the
 * text is reached. Keys and values are decoded as UTF-8, the encoding used by {@code Text}.
 *
 * <p>When parsing stops before the end of the text, {@code text} is replaced by the bytes
 * from the stop position onward (the byte at the stop position is kept). When parsing runs
 * to the end of the text, {@code text} is left unchanged.
 *
 * <p>Truncated input (a key without a key/value separator, or a separator as the last byte)
 * yields an empty value instead of reading past the valid part of the buffer.
 *
 * @param text the text to parse; modified in place
 * @param tags receives the parsed key/value pairs
 */
public static void consumeMap(Text text, Map<String, String> tags) {
    tags.clear();
    final int length = text.getLength();
    if (length == 0) {
        return;
    }
    // Only the first getLength() bytes of the backing array hold valid data.
    final byte[] tagsBytes = text.getBytes();
    if (tagsBytes[0] != Separators[MapStart]) {
        return;
    }
    int i1 = 1;
    while (i1 < length && tagsBytes[i1] != Separators[MapEnd]) {
        // Key: everything up to the key/value separator.
        int i2 = i1 + 1;
        while (i2 < length && tagsBytes[i2] != Separators[KeyValueSeparator]) {
            i2++;
        }
        String key = new String(tagsBytes, i1, i2 - i1, java.nio.charset.StandardCharsets.UTF_8);

        // Value: starts right after the separator. Both offsets are clamped so that a
        // truncated entry never reads beyond the valid length.
        i1 = Math.min(i2 + 1, length);
        i2 = Math.min(i1 + 1, length);
        while (i2 < length
                && tagsBytes[i2] != Separators[FieldSeparator]
                && tagsBytes[i2] != Separators[MapEnd]) {
            i2++;
        }
        String value = new String(tagsBytes, i1, i2 - i1, java.nio.charset.StandardCharsets.UTF_8);
        tags.put(key, value);

        // Continue after the field separator, if there is one.
        i1 = i2;
        if (i1 < length && tagsBytes[i1] == Separators[FieldSeparator]) {
            i1++;
        }
    }
    if (i1 < length) {
        // Keep only the unparsed tail in the text.
        text.set(tagsBytes, i1, length - i1);
    }
}
From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java
License:Open Source License
public static OGCGeometry consumeGeometryESRI(Text text, char separator) { // Check whether this text is a Well Known Text (WKT) or a hexed string boolean wkt = false; byte[] bytes = text.getBytes(); int length = text.getLength(); int i_shape = 0; while (!wkt && i_shape < ShapeNames.length) { byte[] shapeName = ShapeNames[i_shape]; if (length > shapeName.length) { int i = 0; while (i < shapeName.length && shapeName[i] == bytes[i]) i++;/*from ww w . ja v a 2 s .c om*/ if (i == shapeName.length) { wkt = true; break; } } i_shape++; } // Look for the terminator of the shape text int i1 = 0; if (bytes[i1] == '\'' || bytes[i1] == '\"') { separator = (char) bytes[i1++]; } int i2 = i1; while (i2 < length && bytes[i2] != separator) i2++; String str = new String(bytes, i1, i2 - i1); // Remove consumed bytes from the text text.set(bytes, i2, text.getLength() - i2); OGCGeometry geom = parseText(str); return geom; }
From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java
License:Open Source License
public static synchronized Geometry consumeGeometryJTS(Text text, char separator) { // Check whether this text is a Well Known Text (WKT) or a hexed string boolean wkt = false; byte[] bytes = text.getBytes(); int length = text.getLength(); Geometry geom;//ww w . ja va2 s. c om int i1, i2; // Start and end offset of the geometry being parsed int i_next; // Beginning of the next field boolean isWKT = false; boolean isHex = false; if (bytes[0] == '\'' || bytes[0] == '\"') { // A quoted string. Find terminating quote and trim the quotes i1 = 1; i2 = 2; while (i2 < length && bytes[i2] != bytes[0]) i2++; if (i2 == length) throw new RuntimeException("Unterminated quoted string"); i_next = i2 + 1; i2--; // Back one step to remove the terminating quote isWKT = true; // Assume any quoted string to be WKT } else { // Not a quoted string, check if the type is WKT int i_shape = 0; while (!wkt && i_shape < ShapeNames.length) { byte[] shapeName = ShapeNames[i_shape]; if (length > shapeName.length) { int i = 0; while (i < shapeName.length && shapeName[i] == bytes[i]) i++; if (i == shapeName.length) { wkt = true; break; } } i_shape++; } if (i_shape < ShapeNames.length) { isWKT = true; // Look for the terminator of the shape text i1 = 0; i2 = 1; // Search for the first open parenthesis while (i2 < length && bytes[i2] != '(') i2++; if (i2 < length) i2++; // Skip the open parenthesis itself int nesting = 1; while (i2 < length && nesting > 0) { if (bytes[i2] == '(') nesting++; else if (bytes[i2] == ')') nesting--; i2++; } i_next = i2 + 1; } else { // Check if the type is hex-encoded WKB i1 = 0; i2 = 0; while (i2 < length && IsHex[bytes[i2]]) i2++; isHex = i2 > 1; i_next = i2; } } String geom_text = new String(bytes, i1, i2); try { if (isWKT) { geom = wktReader.read(geom_text); } else if (isHex) { byte[] binary = hexToBytes(geom_text); geom = wkbReader.read(binary); } else { geom = null; } } catch (ParseException e) { throw new RuntimeException(String.format("Error parsing '%s'", 
geom_text), e); } // Remove consumed bytes from the text if (i_next >= text.getLength()) text.clear(); else { if (bytes[i_next] == separator) i_next++; text.set(bytes, i_next, length - i_next); } return geom; }
From source file:edu.umn.cs.spatialHadoop.nasa.NASAPoint.java
License:Open Source License
/**
 * Restores this point from its text form.
 *
 * <p>The superclass consumes its own portion of {@code text} first. One byte is then
 * dropped from the front of what remains (presumably the separator that follows the
 * superclass fields; confirm against the matching writer), after which {@code value} is
 * read with {@code ','} as the terminator and {@code timestamp} with {@code '\0'}.
 *
 * @param text the text to read from; modified in place
 */
@Override
public void fromText(Text text) {
    super.fromText(text);

    // Shift the text left by one byte, discarding its first byte.
    final byte[] buffer = text.getBytes();
    final int remaining = text.getLength() - 1;
    text.set(buffer, 1, remaining);

    this.value = TextSerializerHelper.consumeInt(text, ',');
    this.timestamp = TextSerializerHelper.consumeLong(text, '\0');
}
From source file:fm.last.darling.hbase.HBaseJSONOutputReader.java
License:Apache License
private void interpretKeyandValue(byte[] line, int length) throws IOException { // Need to find numKeyFields separators int pos = UTF8ByteArrayUtils.findBytes(line, 0, length, separator); for (int k = 1; k < numKeyFields && pos != -1; k++) { pos = UTF8ByteArrayUtils.findBytes(line, pos + separator.length, length, separator); }//from w w w . j ava 2s .com Text k = new Text(); Text v = new Text(); try { if (pos == -1) { k.set(line, 0, length); v.set(""); } else { StreamKeyValUtil.splitKeyVal(line, 0, length, k, v, pos, separator.length); } } catch (CharacterCodingException e) { throw new IOException(e); } // removing a ' at the start and end of the key byte[] keyBytes = trimOuterBytes(k); rowkey = new ImmutableBytesWritable(keyBytes); put = new Put(keyBytes); String tmpV = v.toString(); String json = tmpV.substring(1, tmpV.length() - 1); Map<String, Map> payload; try { payload = (Map<String, Map>) ObjectBuilder.fromJSON(json); // the 'erased' type? } catch (Exception e) { throw new IOException("error, fromJson: ", e); } Set<Map.Entry<String, Map>> entries = payload.entrySet(); for (Map.Entry<String, Map> entry : entries) { String cfq = entry.getKey(); // let's consider not joining family and qualifier at emitter. String[] parts = cfq.split(":"); if (parts.length < 2) continue; String family = parts[0]; String qualifier = parts[1]; Map dict = entry.getValue(); // unchecked. // expecting dict to carry 'value', Object value = dict.get("value"); if (value == null) continue; // no good. // ..and possibly 'timestamp'. //Object ts = 0; //if (dict.containsKey("timestamp")) //ts = dict.get("timestamp"); put.add(family.getBytes("UTF-8"), qualifier.getBytes("UTF-8"), value.toString().getBytes("UTF-8")); } }
From source file:hivemall.sketch.bloom.BloomFilterUtils.java
License:Apache License
/**
 * Writes {@code filter} through a Base91 encoder and stores the encoded bytes in
 * {@code dst}.
 *
 * @param filter the filter to serialize, via its {@code write} method
 * @param dst the text that receives the encoded bytes; its previous content is replaced
 * @return {@code dst}, for call chaining
 * @throws IOException if writing or encoding the filter fails
 */
@Nonnull
public static Text serialize(@Nonnull final Filter filter, @Nonnull final Text dst) throws IOException {
    // filter -> DataOutputStream -> Base91 encoder -> in-memory buffer
    final FastByteArrayOutputStream buffer = new FastByteArrayOutputStream();
    final Base91OutputStream encoder = new Base91OutputStream(buffer);
    final DataOutputStream sink = new DataOutputStream(encoder);

    filter.write(sink);
    sink.flush();
    encoder.finish();

    // Copy only the written part of the buffer's internal array.
    final byte[] encoded = buffer.getInternalArray();
    final int encodedLength = buffer.size();
    dst.set(encoded, 0, encodedLength);
    return dst;
}
From source file:hivemall.utils.hadoop.JsonSerdeUtils.java
License:Apache License
private static void serializePrimitive(@Nonnull final StringBuilder sb, @Nullable final Object obj, @Nullable final PrimitiveObjectInspector poi) throws SerDeException { if (obj == null) { sb.append("null"); } else {//from w w w. j ava 2 s .c o m switch (poi.getPrimitiveCategory()) { case BOOLEAN: { boolean b = ((BooleanObjectInspector) poi).get(obj); sb.append(b ? "true" : "false"); break; } case BYTE: { sb.append(((ByteObjectInspector) poi).get(obj)); break; } case SHORT: { sb.append(((ShortObjectInspector) poi).get(obj)); break; } case INT: { sb.append(((IntObjectInspector) poi).get(obj)); break; } case LONG: { sb.append(((LongObjectInspector) poi).get(obj)); break; } case FLOAT: { sb.append(((FloatObjectInspector) poi).get(obj)); break; } case DOUBLE: { sb.append(((DoubleObjectInspector) poi).get(obj)); break; } case STRING: { String s = SerDeUtils.escapeString(((StringObjectInspector) poi).getPrimitiveJavaObject(obj)); appendWithQuotes(sb, s); break; } case BINARY: byte[] b = ((BinaryObjectInspector) poi).getPrimitiveJavaObject(obj); Text txt = new Text(); txt.set(b, 0, b.length); appendWithQuotes(sb, SerDeUtils.escapeString(txt.toString())); break; case DATE: Date d = ((DateObjectInspector) poi).getPrimitiveJavaObject(obj); appendWithQuotes(sb, d.toString()); break; case TIMESTAMP: { Timestamp t = ((TimestampObjectInspector) poi).getPrimitiveJavaObject(obj); appendWithQuotes(sb, t.toString()); break; } case DECIMAL: sb.append(((HiveDecimalObjectInspector) poi).getPrimitiveJavaObject(obj)); break; case VARCHAR: { String s = SerDeUtils .escapeString(((HiveVarcharObjectInspector) poi).getPrimitiveJavaObject(obj).toString()); appendWithQuotes(sb, s); break; } case CHAR: { //this should use HiveChar.getPaddedValue() but it's protected; currently (v0.13) // HiveChar.toString() returns getPaddedValue() String s = SerDeUtils .escapeString(((HiveCharObjectInspector) poi).getPrimitiveJavaObject(obj).toString()); appendWithQuotes(sb, s); break; } default: throw new 
SerDeException("Unknown primitive type: " + poi.getPrimitiveCategory()); } } }
From source file:io.fluo.core.util.ByteUtil.java
License:Apache License
/**
 * Converts a {@code Bytes} object into a Hadoop {@code Text} object.
 *
 * <p>When {@code b} exposes a backing array, the relevant range of that array is handed to
 * {@code Text.set} directly; otherwise the content is first materialized with
 * {@code toArray()}.
 *
 * @param b Bytes
 * @return Text object
 */
public static Text toText(Bytes b) {
    if (!b.isBackedByArray()) {
        return new Text(b.toArray());
    }
    final Text result = new Text(EMPTY);
    result.set(b.getBackingArray(), b.offset(), b.length());
    return result;
}
From source file:mapred.io.CustomRecordReader.java
License:Apache License
private int skipUtfByteOrderMark() throws IOException { Text value = new Text(); // Strip BOM(Byte Order Mark) // Text only support UTF-8, we only need to check UTF-8 BOM // (0xEF,0xBB,0xBF) at the start of the text stream. int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE); int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos)); // Even we read 3 extra bytes for the first line, // we won't alter existing behavior (no backwards incompat issue). // Because the newSize is less than maxLineLength and // the number of bytes copied to Text is always no more than newSize. // If the return size from readLine is not less than maxLineLength, // we will discard the current line and read the next line. pos += newSize;/* ww w . j av a 2s .co m*/ int textLength = value.getLength(); byte[] textBytes = value.getBytes(); if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB) && (textBytes[2] == (byte) 0xBF)) { // find UTF-8 BOM, strip it. LOG.info("Found UTF-8 BOM and skipped it"); textLength -= 3; newSize -= 3; if (textLength > 0) { // It may work to use the same buffer and not do the copyBytes textBytes = value.copyBytes(); value.set(textBytes, 3, textLength); } else { value.clear(); } } return newSize; }
From source file:mr.MyFileRecordReader2.java
License:Apache License
private int skipUtfByteOrderMark(Text value) throws IOException { // Strip BOM(Byte Order Mark) // Text only support UTF-8, we only need to check UTF-8 BOM // (0xEF,0xBB,0xBF) at the start of the text stream. int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength, Integer.MAX_VALUE); int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos)); // Even we read 3 extra bytes for the first line, // we won't alter existing behavior (no backwards incompat issue). // Because the newSize is less than maxLineLength and // the number of bytes copied to Text is always no more than newSize. // If the return size from readLine is not less than maxLineLength, // we will discard the current line and read the next line. pos += newSize;//from w w w . j a va2 s .c om int textLength = value.getLength(); byte[] textBytes = value.getBytes(); if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB) && (textBytes[2] == (byte) 0xBF)) { // find UTF-8 BOM, strip it. LOG.info("Found UTF-8 BOM and skipped it"); textLength -= 3; newSize -= 3; if (textLength > 0) { // It may work to use the same buffer and not do the copyBytes textBytes = value.copyBytes(); value.set(textBytes, 3, textLength); } else { value.clear(); } } return newSize; }