List of usage examples for org.apache.hadoop.io.Text.getLength()
@Override public int getLength()
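Before the examples, a minimal sketch of the contract these call sites rely on: getLength() returns the number of valid UTF-8 bytes in the Text, while getBytes() returns a reusable backing array that may be longer than that, so the two are almost always used together. The class name and the specific values below are illustrative only, not taken from the examples.

import org.apache.hadoop.io.Text;

public class TextLengthDemo {
    public static void main(String[] args) throws Exception {
        Text t = new Text("hadoop");   // 6 UTF-8 bytes
        t.set(new Text("hi"));         // reuse the instance; the backing array is not shrunk

        int len = t.getLength();       // 2 -- the number of valid bytes
        byte[] raw = t.getBytes();     // raw.length can still be 6 here

        // Safe decode: honor getLength() rather than raw.length,
        // exactly as the examples below do when copying or decoding.
        String s = Text.decode(raw, 0, len);
        System.out.println(len + " / " + raw.length + " -> " + s);
    }
}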
From source file:edu.isi.mavuno.extract.DIRTExtractor.java
License:Apache License
private List<ContextPatternWritable> getContext(ArrayListOfInts leftPath, ArrayListOfInts rightPath,
        List<TratzParsedTokenWritable> tokens, Text[] chunks) { //, int leftContextSize, int rightContextSize) {
    // construct (context, pattern) pairs
    List<ContextPatternWritable> contexts = new ArrayList<ContextPatternWritable>();

    // make sure that the dimensions are feasible
    if (leftPath.size() < 1 || rightPath.size() < 1) {
        return contexts;
    }

    // make sure we don't split the left context's chunk
    Text leftChunk = chunks[leftPath.get(0) - 1];
    for (int i = 1; i <= leftPath.size() - 1; i++) {
        if (chunks[leftPath.get(i) - 1].equals(leftChunk)) {
            return contexts;
        }
    }

    // make sure we don't split the right context's chunk
    Text rightChunk = chunks[rightPath.get(0) - 1];
    for (int i = rightPath.size() - 1; i >= 1; i--) {
        if (chunks[rightPath.get(i) - 1].equals(rightChunk)) {
            return contexts;
        }
    }

    TratzParsedTokenWritable t;
    Text term, posTag, dependType;

    // construct pattern based on the parse tree path
    final Text pattern = new Text();

    // encode left context chunk type
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    Text leftNETag = tokens.get(leftPath.get(0) - 1).getNETag();
    Text leftChunkTag = tokens.get(leftPath.get(0) - 1).getChunkTag();
    if (leftNETag.getLength() != 1 || (leftNETag.getLength() > 0 && leftNETag.getBytes()[0] != 'O')) {
        pattern.append(leftNETag.getBytes(), 0, leftNETag.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    } else {
        if (leftChunkTag.getLength() > 2) {
            pattern.append(leftChunkTag.getBytes(), 2, leftChunkTag.getLength() - 2);
        } else {
            pattern.append(leftChunkTag.getBytes(), 0, leftChunkTag.getLength());
        }
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    }

    // left path portion of pattern
    if (!mHeadOnly) {
        for (int i = 0; i <= leftPath.size() - 2; i++) {
            t = tokens.get(leftPath.get(i) - 1);
            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();
            if (i != 0) {
                pattern.append(term.getBytes(), 0, term.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            }
            pattern.append(dependType.getBytes(), 0, dependType.getLength());
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }
    } else {
        dependType = tokens.get(leftPath.get(0) - 1).getDependType();
        posTag = tokens.get(leftPath.get(0) - 1).getPosTag();
        pattern.append(dependType.getBytes(), 0, dependType.getLength());
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    }

    // root portion of pattern
    if (leftPath.get(leftPath.size() - 1) != rightPath.get(rightPath.size() - 1)) {
        throw new RuntimeException(
                "Left and right paths do not share the same root! -- " + leftPath + " <--> " + rightPath);
    }
    t = tokens.get(leftPath.get(leftPath.size() - 1) - 1);
    term = mUseLemmas ? t.getLemma() : t.getToken();
    dependType = t.getDependType();
    posTag = t.getPosTag();
    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(term.getBytes(), 0, term.getLength());
    pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
    pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());

    // right path portion of pattern
    if (!mHeadOnly) {
        for (int i = rightPath.size() - 2; i >= 0; i--) {
            t = tokens.get(rightPath.get(i) - 1);
            term = mUseLemmas ? t.getLemma() : t.getToken();
            dependType = t.getDependType();
            posTag = t.getPosTag();
            pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
            pattern.append(dependType.getBytes(), 0, dependType.getLength());
            if (i != 0) {
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(posTag.getBytes(), 0, posTag.getLength() > 2 ? 2 : posTag.getLength());
                pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                pattern.append(term.getBytes(), 0, term.getLength());
            }
        }
    } else {
        dependType = tokens.get(rightPath.get(0) - 1).getDependType();
        posTag = tokens.get(rightPath.get(0) - 1).getPosTag();
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(dependType.getBytes(), 0, dependType.getLength());
    }

    // encode right context chunk type
    // TODO: replace this with a more robust way of checking if this is an actual named entity or not
    Text rightNETag = tokens.get(rightPath.get(0) - 1).getNETag();
    Text rightChunkTag = tokens.get(rightPath.get(0) - 1).getChunkTag();
    if (rightNETag.getLength() != 1 || (rightNETag.getLength() > 0 && rightNETag.getBytes()[0] != 'O')) {
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        pattern.append(rightNETag.getBytes(), 0, rightNETag.getLength());
    } else {
        pattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        if (rightChunkTag.getLength() > 2) {
            pattern.append(rightChunkTag.getBytes(), 2, rightChunkTag.getLength() - 2);
        } else {
            pattern.append(rightChunkTag.getBytes(), 0, rightChunkTag.getLength());
        }
    }

    if (mOrContextStyle) {
        if (!mRightOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(leftChunk, MavunoUtils.ASTERISK));
            c.setPattern(pattern);
            contexts.add(c);
        }
        if (!mLeftOnlyContextStyle) {
            ContextPatternWritable c = new ContextPatternWritable();
            c.setContext(MavunoUtils.createContext(MavunoUtils.ASTERISK, rightChunk));
            c.setPattern(pattern);
            contexts.add(c);
        }
    } else {
        ContextPatternWritable c = new ContextPatternWritable();
        c.setContext(MavunoUtils.createContext(leftChunk, rightChunk));
        c.setPattern(pattern);
        contexts.add(c);
    }

    return contexts;
}
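The extractor above builds its pattern by repeatedly appending raw bytes with append(getBytes(), 0, getLength()) and a pipe separator. A stripped-down illustration of that joining idiom follows; TextJoin and its PIPE constant are hypothetical helpers, not part of Mavuno or Hadoop.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;

final class TextJoin {
    private static final byte[] PIPE = "|".getBytes(StandardCharsets.UTF_8);

    private TextJoin() {
    }

    // Concatenates the given Text values with '|' at the byte level,
    // avoiding intermediate String allocations; getLength() bounds the
    // copy to the valid bytes of each value.
    static Text joinWithPipe(Text... parts) {
        Text out = new Text();
        for (int i = 0; i < parts.length; i++) {
            out.append(parts[i].getBytes(), 0, parts[i].getLength());
            if (i != parts.length - 1) {
                out.append(PIPE, 0, PIPE.length);
            }
        }
        return out;
    }
}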
From source file:edu.isi.mavuno.extract.NAryChunkExtractor.java
License:Apache License
private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // get chunk ids
    List<TratzParsedTokenWritable> sentenceTokens = sentence.getTokens();
    int[] chunkIds = NLProcTools.getChunkIds(sentenceTokens);

    mChunks.clear();
    mChunkTokens.clear();

    // extract chunks from sentence
    for (int i = 0; i < chunkIds.length; i++) {
        if (i > 0 && chunkIds[i] != chunkIds[i - 1]) {
            Chunk chunk = createChunk(mChunkTokens);
            mChunks.add(chunk);
            mChunkTokens.clear();
        }
        mChunkTokens.add(sentenceTokens.get(i));
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) {
        Chunk chunk = createChunk(mChunkTokens);
        mChunks.add(chunk);
    }

    // there's nothing we can do if there aren't at least mArity chunks in the sentence
    if (mArity > mChunks.size()) {
        mChunkPairsIter = mChunkPairs.iterator();
        return;
    }

    // initialize context positions
    for (int i = 0; i < mArity; i++) {
        mContextPositions[i] = i;
    }

    // initialize pattern positions
    for (int i = 0; i < mArity - 1; i++) {
        mPatternPositions[i] = i;
    }

    // generate (context, pattern) pairs based on chunks
    final Text basePattern = new Text();
    while (true) {
        // construct context
        for (int i = 0; i < mArity; i++) {
            mContextChunks[i] = mChunks.get(mContextPositions[i]);
        }

        // construct pattern
        for (int i = 0; i < mArity - 1; i++) {
            mPatternChunks[i] = mChunks.get(mPatternPositions[i]);
        }

        // add (context, pattern) pair
        basePattern.clear();
        for (int i = 0; i < mArity - 1; i++) {
            // left chunk type
            basePattern.append(mContextChunks[i].type.getBytes(), 0, mContextChunks[i].type.getLength());
            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

            if (mContextPositions[i + 1] - mPatternPositions[i] > 1
                    || mPatternPositions[i] - mContextPositions[i] > 1) {
                if (mPatternPositions[i] == mContextPositions[i]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] == mContextPositions[i] + 1) {
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                } else if (mPatternPositions[i] + 1 == mContextPositions[i + 1]) {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                } else {
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                    basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
                    basePattern.append(MavunoUtils.ASTERISK_BYTES, 0, MavunoUtils.ASTERISK_BYTES_LENGTH);
                }
            } else if (mPatternPositions[i] == mContextPositions[i]) {
                basePattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
            } else {
                basePattern.append(mPatternChunks[i].text.getBytes(), 0, mPatternChunks[i].text.getLength());
            }

            basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        }

        // last chunk type
        basePattern.append(mContextChunks[mArity - 1].type.getBytes(), 0,
                mContextChunks[mArity - 1].type.getLength());
        basePattern.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);

        int[] indices;
        mPermGen.reset();
        while (mPermGen.hasMore()) {
            // get next permutation
            indices = mPermGen.getNext();

            ContextPatternWritable c = new ContextPatternWritable();

            // pattern
            c.setPattern(basePattern);
            Text numLeftText = new Text(mPermGen.getNumLeft() + "/" + mArity);
            c.getPattern().append(numLeftText.getBytes(), 0, numLeftText.getLength());

            // context
            c.getContext().clear();
            for (int i = 0; i < mArity; i++) {
                c.getContext().append(mContextChunks[indices[i]].text.getBytes(), 0,
                        mContextChunks[indices[i]].text.getLength());
                if (i != mArity - 1) {
                    c.getContext().append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
                }
            }

            // add to chunk pairs
            mChunkPairs.add(c);
        }

        // get next set of context and pattern positions
        int pos = mArity - 2;
        while (pos >= 0) {
            if (mPatternPositions[pos] + 1 < mChunks.size()
                    && mPatternPositions[pos] + 1 < mContextPositions[pos + 1]) {
                mPatternPositions[pos]++;
                for (int i = pos + 1; i < mArity - 2; i++) {
                    mPatternPositions[i] = mContextPositions[i];
                }
                break;
            }
            pos--;
        }

        // update the context positions if the pattern positions can't be updated any further
        if (pos < 0) {
            pos = mArity - 1;
            while (pos >= 0) {
                if (mContextPositions[pos] + 1 < mChunks.size()
                        && (pos + 1 >= mArity || mContextPositions[pos + 1] - (mContextPositions[pos] + 1) >= 1)
                        && (pos <= 0 || mContextPositions[pos] - mContextPositions[pos - 1] - 1 <= mMaxSkipSize)) {
                    mContextPositions[pos]++;
                    if (pos < mArity - 1) {
                        mPatternPositions[pos] = mContextPositions[pos];
                    }
                    for (int i = pos + 1; i < mArity; i++) {
                        mContextPositions[i] = mContextPositions[pos] + (i - pos);
                        if (i < mArity - 1) {
                            mPatternPositions[i] = mContextPositions[i];
                        }
                    }
                    break;
                }
                pos--;
            }

            // if neither the context nor the pattern positions can be updated then we're done
            if (pos < 0) {
                // get iterator
                mChunkPairsIter = mChunkPairs.iterator();
                return;
            }
        }
    }
}
From source file:edu.isi.mavuno.extract.NAryChunkExtractor.java
License:Apache License
private Chunk createChunk(List<TratzParsedTokenWritable> terms) {
    Chunk chunk = new Chunk();

    // construct the chunk text and detect the chunk type
    Text chunkNEType = null;
    for (int i = 0; i < terms.size(); i++) {
        Text term = terms.get(i).getToken();
        Text neTag = terms.get(i).getNETag();

        chunk.text.append(term.getBytes(), 0, term.getLength());
        if (i != terms.size() - 1) {
            chunk.text.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }

        // TODO: replace this with a more robust way of checking if this is an actual named entity or not
        if (neTag.getLength() != 1 || (neTag.getLength() > 0 && neTag.getBytes()[0] != 'O')) {
            chunkNEType = neTag;
        }
    }

    // set the chunk type (note that this is somewhat heuristic)
    if (chunkNEType != null) {
        // chunk type = named entity type, if present
        chunk.type.set(chunkNEType);
    } else {
        // otherwise, chunk type = chunk tag
        Text chunkTag = terms.get(0).getChunkTag();
        if (chunkTag.getLength() > 2) {
            chunk.type.set(chunkTag.getBytes(), 2, chunkTag.getLength() - 2);
        } else {
            chunk.type.set(chunkTag);
        }
    }

    return chunk;
}
From source file:edu.isi.mavuno.nlp.NLProcTools.java
License:Apache License
private static Text extractGeneralizedChunk(int chunkStart, int chunkEnd, List<TratzParsedTokenWritable> tokens,
        boolean appendPOSTag) {
    Text chunk = new Text();

    int startPos;
    for (startPos = chunkEnd; startPos >= chunkStart; startPos--) {
        if (!tokens.get(startPos).getPosTag().toString().startsWith("NN")) {
            break;
        }
    }

    if (startPos != chunkStart - 1 && startPos != chunkEnd) {
        Text generalChunk = extractMainChunk(startPos + 1, chunkEnd + 1, tokens, appendPOSTag);
        if (generalChunk != null) {
            chunk.set(generalChunk);
        }
    }

    // return null if no valid terms are found
    if (chunk.getLength() == 0) {
        return null;
    }

    return chunk;
}
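extractGeneralizedChunk above treats getLength() == 0 as "nothing was extracted" and returns null. The same emptiness check, pulled out as a hypothetical helper for illustration (TextChecks is not part of Hadoop):

import org.apache.hadoop.io.Text;

final class TextChecks {
    private TextChecks() {
    }

    // A Text that has had nothing set or appended reports getLength() == 0;
    // checking the length avoids building a String just to call isEmpty().
    static boolean isEmpty(Text t) {
        return t == null || t.getLength() == 0;
    }
}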
From source file:edu.mit.ll.graphulo.pig.backend.GraphuloOneTableStorage.java
License:Apache License
@Override
protected Tuple getTuple(Key key, Value value) throws IOException {
    SortedMap<Key, Value> rowKVs = WholeRowIterator.decodeRow(key, value);
    Tuple tuple = TupleFactory.getInstance().newTuple(columns.size() + 1);

    final Text cfHolder = new Text();
    final Text cqHolder = new Text();
    final Text row = key.getRow();

    int tupleOffset = 0;
    tuple.set(tupleOffset, new DataByteArray(Text.decode(row.getBytes(), 0, row.getLength())));

    for (Column column : this.columns) {
        tupleOffset++;

        switch (column.getType()) {
        case LITERAL:
            cfHolder.set(column.getColumnFamily());
            if (null != column.getColumnQualifier()) {
                cqHolder.set(column.getColumnQualifier());
            } else {
                cqHolder.set(EMPTY_TEXT);
            }

            // Get the key where our literal would exist (accounting for
            // "colf:colq" or "colf:" empty colq)
            Key literalStartKey = new Key(row, cfHolder, cqHolder);
            SortedMap<Key, Value> tailMap = rowKVs.tailMap(literalStartKey);

            // Find the element
            if (tailMap.isEmpty()) {
                tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
            } else {
                Key actualKey = tailMap.firstKey();

                // Only place it in the tuple if it matches the user request,
                // avoid using a value from a key with the wrong colqual
                if (0 == literalStartKey.compareTo(actualKey, PartialKey.ROW_COLFAM_COLQUAL)) {
                    tuple.set(tupleOffset, new DataByteArray(tailMap.get(actualKey).get()));
                } else {
                    // This row doesn't have the column we were looking for
                    tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
                }
            }
            break;
        case COLFAM_PREFIX:
            cfHolder.set(column.getColumnFamily());
            Range colfamPrefixRange = Range.prefix(row, cfHolder);
            Key colfamPrefixStartKey = new Key(row, cfHolder);
            SortedMap<Key, Value> cfTailMap = rowKVs.tailMap(colfamPrefixStartKey);

            // Find the element
            if (cfTailMap.isEmpty()) {
                tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
            } else {
                HashMap<String, DataByteArray> tupleMap = new HashMap<String, DataByteArray>();

                // Build up a map for all the entries in this row that match the colfam prefix
                for (Entry<Key, Value> entry : cfTailMap.entrySet()) {
                    if (colfamPrefixRange.contains(entry.getKey())) {
                        entry.getKey().getColumnFamily(cfHolder);
                        entry.getKey().getColumnQualifier(cqHolder);
                        DataByteArray val = new DataByteArray(entry.getValue().get());

                        // Avoid adding an extra ':' when colqual is empty
                        if (0 == cqHolder.getLength()) {
                            tupleMap.put(cfHolder.toString(), val);
                        } else {
                            tupleMap.put(cfHolder.toString() + COLON + cqHolder.toString(), val);
                        }
                    } else {
                        break;
                    }
                }

                if (!tupleMap.isEmpty()) {
                    tuple.set(tupleOffset, tupleMap);
                }
            }
            break;
        case COLQUAL_PREFIX:
            cfHolder.set(column.getColumnFamily());
            cqHolder.set(column.getColumnQualifier());
            Range colqualPrefixRange = Range.prefix(row, cfHolder, cqHolder);
            Key colqualPrefixStartKey = new Key(row, cfHolder, cqHolder);
            SortedMap<Key, Value> cqTailMap = rowKVs.tailMap(colqualPrefixStartKey);

            if (cqTailMap.isEmpty()) {
                tuple.set(tupleOffset, EMPTY_DATA_BYTE_ARRAY);
            } else {
                HashMap<String, DataByteArray> tupleMap = new HashMap<String, DataByteArray>();

                // Build up a map for all the entries in this row that match the colqual prefix
                for (Entry<Key, Value> entry : cqTailMap.entrySet()) {
                    if (colqualPrefixRange.contains(entry.getKey())) {
                        entry.getKey().getColumnFamily(cfHolder);
                        entry.getKey().getColumnQualifier(cqHolder);
                        DataByteArray val = new DataByteArray(entry.getValue().get());

                        // Avoid the extra ':' on empty colqual
                        if (0 == cqHolder.getLength()) {
                            tupleMap.put(cfHolder.toString(), val);
                        } else {
                            tupleMap.put(cfHolder.toString() + COLON + cqHolder.toString(), val);
                        }
                    } else {
                        break;
                    }
                }

                if (!tupleMap.isEmpty()) {
                    tuple.set(tupleOffset, tupleMap);
                }
            }
            break;
        default:
            break;
        }
    }

    return tuple;
}
From source file:edu.uci.ics.hivesterix.serde.lazy.LazySerDe.java
License:Apache License
/**
 * Deserialize a table record to a Lazy struct.
 */
@SuppressWarnings("deprecation")
@Override
public Object deserialize(Writable field) throws SerDeException {
    if (byteArrayRef == null) {
        byteArrayRef = new ByteArrayRef();
    }
    if (field instanceof BytesWritable) {
        BytesWritable b = (BytesWritable) field;
        if (b.getSize() == 0) {
            return null;
        }
        // For backward-compatibility with hadoop 0.17
        byteArrayRef.setData(b.get());
        cachedLazyStruct.init(byteArrayRef.getData(), 0, b.getSize());
    } else if (field instanceof Text) {
        Text t = (Text) field;
        if (t.getLength() == 0) {
            return null;
        }
        byteArrayRef.setData(t.getBytes());
        cachedLazyStruct.init(byteArrayRef.getData(), 0, t.getLength());
    } else {
        throw new SerDeException(getClass().toString() + ": expects either BytesWritable or Text object!");
    }
    return cachedLazyStruct;
}
From source file:edu.uci.ics.hivesterix.serde.lazy.LazySerDe.java
License:Apache License
/**
 * A recursive function that serializes an object to a byte buffer based on
 * its object inspector.
 *
 * @param byteStream
 *            the byte stream storing the serialization data
 * @param obj
 *            the object to serialize
 * @param objInspector
 *            the object inspector
 */
private void serialize(Output byteStream, Object obj, ObjectInspector objInspector) {
    // do nothing for null object
    if (null == obj) {
        return;
    }

    switch (objInspector.getCategory()) {
    case PRIMITIVE: {
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) objInspector;
        switch (poi.getPrimitiveCategory()) {
        case VOID: {
            return;
        }
        case BOOLEAN: {
            boolean v = ((BooleanObjectInspector) poi).get(obj);
            byteStream.write((byte) (v ? 1 : 0));
            return;
        }
        case BYTE: {
            ByteObjectInspector boi = (ByteObjectInspector) poi;
            byte v = boi.get(obj);
            byteStream.write(v);
            return;
        }
        case SHORT: {
            ShortObjectInspector spoi = (ShortObjectInspector) poi;
            short v = spoi.get(obj);
            byteStream.write((byte) (v >> 8));
            byteStream.write((byte) (v));
            return;
        }
        case INT: {
            IntObjectInspector ioi = (IntObjectInspector) poi;
            int v = ioi.get(obj);
            LazyUtils.writeVInt(byteStream, v);
            return;
        }
        case LONG: {
            LongObjectInspector loi = (LongObjectInspector) poi;
            long v = loi.get(obj);
            LazyUtils.writeVLong(byteStream, v);
            return;
        }
        case FLOAT: {
            FloatObjectInspector foi = (FloatObjectInspector) poi;
            int v = Float.floatToIntBits(foi.get(obj));
            byteStream.write((byte) (v >> 24));
            byteStream.write((byte) (v >> 16));
            byteStream.write((byte) (v >> 8));
            byteStream.write((byte) (v));
            return;
        }
        case DOUBLE: {
            DoubleObjectInspector doi = (DoubleObjectInspector) poi;
            long v = Double.doubleToLongBits(doi.get(obj));
            byteStream.write((byte) (v >> 56));
            byteStream.write((byte) (v >> 48));
            byteStream.write((byte) (v >> 40));
            byteStream.write((byte) (v >> 32));
            byteStream.write((byte) (v >> 24));
            byteStream.write((byte) (v >> 16));
            byteStream.write((byte) (v >> 8));
            byteStream.write((byte) (v));
            return;
        }
        case STRING: {
            StringObjectInspector soi = (StringObjectInspector) poi;
            Text t = soi.getPrimitiveWritableObject(obj);
            /* write byte size of the string which is a vint */
            int length = t.getLength();
            LazyUtils.writeVInt(byteStream, length);
            /* write string itself */
            byte[] data = t.getBytes();
            byteStream.write(data, 0, length);
            return;
        }
        default: {
            throw new RuntimeException("Unrecognized type: " + poi.getPrimitiveCategory());
        }
        }
    }
    case LIST: {
        ListObjectInspector loi = (ListObjectInspector) objInspector;
        ObjectInspector eoi = loi.getListElementObjectInspector();

        // 1/ reserve spaces for the byte size of the list
        // which is a integer and takes four bytes
        int byteSizeStart = byteStream.getCount();
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        int listStart = byteStream.getCount();

        // 2/ write the size of the list as a VInt
        int size = loi.getListLength(obj);
        LazyUtils.writeVInt(byteStream, size);

        // 3/ write the null bytes
        byte nullByte = 0;
        for (int eid = 0; eid < size; eid++) {
            // set the bit to 1 if an element is not null
            if (null != loi.getListElement(obj, eid)) {
                nullByte |= 1 << (eid % 8);
            }
            // store the byte every eight elements or
            // if this is the last element
            if (7 == eid % 8 || eid == size - 1) {
                byteStream.write(nullByte);
                nullByte = 0;
            }
        }

        // 4/ write element by element from the list
        for (int eid = 0; eid < size; eid++) {
            serialize(byteStream, loi.getListElement(obj, eid), eoi);
        }

        // 5/ update the list byte size
        int listEnd = byteStream.getCount();
        int listSize = listEnd - listStart;
        byte[] bytes = byteStream.getData();
        bytes[byteSizeStart] = (byte) (listSize >> 24);
        bytes[byteSizeStart + 1] = (byte) (listSize >> 16);
        bytes[byteSizeStart + 2] = (byte) (listSize >> 8);
        bytes[byteSizeStart + 3] = (byte) (listSize);
        return;
    }
    case MAP: {
        MapObjectInspector moi = (MapObjectInspector) objInspector;
        ObjectInspector koi = moi.getMapKeyObjectInspector();
        ObjectInspector voi = moi.getMapValueObjectInspector();
        Map<?, ?> map = moi.getMap(obj);

        // 1/ reserve spaces for the byte size of the map
        // which is a integer and takes four bytes
        int byteSizeStart = byteStream.getCount();
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        int mapStart = byteStream.getCount();

        // 2/ write the size of the map which is a VInt
        int size = map.size();
        LazyUtils.writeVInt(byteStream, size);

        // 3/ write the null bytes
        int b = 0;
        byte nullByte = 0;
        for (Map.Entry<?, ?> entry : map.entrySet()) {
            // set the bit to 1 if a key is not null
            if (null != entry.getKey()) {
                nullByte |= 1 << (b % 8);
            } else if (!nullMapKey) {
                nullMapKey = true;
                LOG.warn("Null map key encountered! Ignoring similar problems.");
            }
            b++;
            // set the bit to 1 if a value is not null
            if (null != entry.getValue()) {
                nullByte |= 1 << (b % 8);
            }
            b++;
            // write the byte to stream every 4 key-value pairs
            // or if this is the last key-value pair
            if (0 == b % 8 || b == size * 2) {
                byteStream.write(nullByte);
                nullByte = 0;
            }
        }

        // 4/ write key-value pairs one by one
        for (Map.Entry<?, ?> entry : map.entrySet()) {
            serialize(byteStream, entry.getKey(), koi);
            serialize(byteStream, entry.getValue(), voi);
        }

        // 5/ update the byte size of the map
        int mapEnd = byteStream.getCount();
        int mapSize = mapEnd - mapStart;
        byte[] bytes = byteStream.getData();
        bytes[byteSizeStart] = (byte) (mapSize >> 24);
        bytes[byteSizeStart + 1] = (byte) (mapSize >> 16);
        bytes[byteSizeStart + 2] = (byte) (mapSize >> 8);
        bytes[byteSizeStart + 3] = (byte) (mapSize);
        return;
    }
    case STRUCT: {
        // 1/ reserve spaces for the byte size of the struct
        // which is a integer and takes four bytes
        int byteSizeStart = byteStream.getCount();
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        byteStream.write((byte) 0);
        int structStart = byteStream.getCount();

        // 2/ serialize the struct
        serializeStruct(byteStream, obj, (StructObjectInspector) objInspector);

        // 3/ update the byte size of the struct
        int structEnd = byteStream.getCount();
        int structSize = structEnd - structStart;
        byte[] bytes = byteStream.getData();
        bytes[byteSizeStart] = (byte) (structSize >> 24);
        bytes[byteSizeStart + 1] = (byte) (structSize >> 16);
        bytes[byteSizeStart + 2] = (byte) (structSize >> 8);
        bytes[byteSizeStart + 3] = (byte) (structSize);
        return;
    }
    default: {
        throw new RuntimeException("Unrecognized type: " + objInspector.getCategory());
    }
    }
}
From source file:edu.uci.ics.hyracks.hdfs.lib.TextKeyValueParserFactory.java
License:Apache License
@Override
public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx)
        throws HyracksDataException {
    final ArrayTupleBuilder tb = new ArrayTupleBuilder(1);
    final ByteBuffer buffer = ctx.allocateFrame();
    final FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize());
    appender.reset(buffer, true);

    return new IKeyValueParser<LongWritable, Text>() {

        @Override
        public void open(IFrameWriter writer) {
        }

        @Override
        public void parse(LongWritable key, Text value, IFrameWriter writer, String fileString)
                throws HyracksDataException {
            tb.reset();
            tb.addField(value.getBytes(), 0, value.getLength());

            if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                FrameUtils.flushFrame(buffer, writer);
                appender.reset(buffer, true);
                if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                    throw new HyracksDataException("tuple cannot be appended into the frame");
                }
            }
        }

        @Override
        public void close(IFrameWriter writer) throws HyracksDataException {
            FrameUtils.flushFrame(buffer, writer);
        }
    };
}
From source file:edu.uci.ics.hyracks.imru.dataflow.Hdtest.java
License:Apache License
public static JobSpecification createJob() throws Exception {
    JobSpecification spec = new JobSpecification();
    spec.setFrameSize(4096);

    String PATH_TO_HADOOP_CONF = "/home/wangrui/a/imru/hadoop-0.20.2/conf";
    String HDFS_INPUT_PATH = "/customer/customer.tbl,/customer_result/part-0";
    JobConf conf = new JobConf();
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    conf.setInputFormat(TextInputFormat.class);

    RecordDescriptor recordDesc = new RecordDescriptor(
            new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });

    InputSplit[] splits = conf.getInputFormat().getSplits(conf, 1);
    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(spec, recordDesc, conf, splits,
            new String[] { "NC0", "NC1" }, new IKeyValueParserFactory<LongWritable, Text>() {
                @Override
                public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
                    return new IKeyValueParser<LongWritable, Text>() {
                        TupleWriter tupleWriter;

                        @Override
                        public void open(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter = new TupleWriter(ctx, writer, 1);
                        }

                        @Override
                        public void parse(LongWritable key, Text value, IFrameWriter writer, String fileString)
                                throws HyracksDataException {
                            try {
                                tupleWriter.write(value.getBytes(), 0, value.getLength());
                                tupleWriter.finishField();
                                tupleWriter.finishTuple();
                            } catch (IOException e) {
                                throw new HyracksDataException(e);
                            }
                        }

                        @Override
                        public void close(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter.close();
                        }
                    };
                }
            });
    // createPartitionConstraint(spec, readOperator, new String[] {"NC0"});
    PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, readOperator, new String[] { "NC0", "NC1" });

    IOperatorDescriptor writer = new HDFSOD(spec, null, null, null);
    // createPartitionConstraint(spec, writer, outSplits);
    spec.connect(new OneToOneConnectorDescriptor(spec), readOperator, 0, writer, 0);
    spec.addRoot(writer);
    return spec;
}
From source file:edu.uci.ics.pregelix.example.aggregator.OverflowAggregator.java
License:Apache License
@Override
public void step(Text partialResult) {
    textLength += partialResult.getLength();
}