Usage examples for org.apache.hadoop.io.Text.bytesToCodePoint
public static int bytesToCodePoint(ByteBuffer bytes)
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) { Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00"); ByteBuffer buf = ByteBuffer.wrap(t.getBytes(), 0, t.getLength()); int cp;//from w ww . j a v a 2 s . c om while (buf.hasRemaining() && (cp = Text.bytesToCodePoint(buf)) != -1) { System.out.println(Integer.toHexString(cp)); } }
From source file:io.aos.hdfs.TextIterator.java
License:Apache License
public static void main(String... args) { Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00"); ByteBuffer buf = ByteBuffer.wrap(t.getBytes(), 0, t.getLength()); int cp;//from w w w . j a va2 s . c o m while (buf.hasRemaining() && (cp = Text.bytesToCodePoint(buf)) != -1) { System.out.println(Integer.toHexString(cp)); } }
From source file:org.apache.orc.impl.mask.RedactMaskFactory.java
License:Apache License
/** * Get the next code point from the ByteBuffer. Moves the position in the * ByteBuffer forward to the next code point. * @param param the source of bytes/*from www.j av a2s . c o m*/ * @param defaultValue if there are no bytes left, use this value * @return the code point that was found at the front of the buffer. */ static int getNextCodepoint(ByteBuffer param, int defaultValue) { if (param.remaining() == 0) { return defaultValue; } else { return Text.bytesToCodePoint(param); } }
From source file:org.apache.orc.impl.mask.RedactMaskFactory.java
License:Apache License
/** * Mask a string by finding the character category of each character * and replacing it with the matching literal. * @param source the source column vector * @param row the value index/* w ww. j a v a2 s. c o m*/ * @param target the target column vector */ void maskString(BytesColumnVector source, int row, BytesColumnVector target) { int expectedBytes = source.length[row]; ByteBuffer sourceBytes = ByteBuffer.wrap(source.vector[row], source.start[row], source.length[row]); // ensure we have enough space, if the masked data is the same size target.ensureValPreallocated(expectedBytes); byte[] outputBuffer = target.getValPreallocatedBytes(); int outputOffset = target.getValPreallocatedStart(); int outputStart = outputOffset; int index = 0; while (sourceBytes.remaining() > 0) { int cp = Text.bytesToCodePoint(sourceBytes); // Find the replacement for the current character. int replacement = getReplacement(cp); if (replacement == UNMASKED_CHAR || isIndexInUnmaskRange(index, source.length[row])) { replacement = cp; } // increment index index++; int len = getCodepointLength(replacement); // If the translation will overflow the buffer, we need to resize. // This will only happen when the masked size is larger than the original. if (len + outputOffset > outputBuffer.length) { // Revise estimate how much we are going to need now. We are maximally // pesamistic here so that we don't have to expand again for this value. int currentOutputStart = outputStart; int currentOutputLength = outputOffset - currentOutputStart; expectedBytes = currentOutputLength + len + sourceBytes.remaining() * 4; // Expand the buffer to fit the new estimate target.ensureValPreallocated(expectedBytes); // Copy over the bytes we've already written for this value and move // the pointers to the new output buffer. 
byte[] oldBuffer = outputBuffer; outputBuffer = target.getValPreallocatedBytes(); outputOffset = target.getValPreallocatedStart(); outputStart = outputOffset; System.arraycopy(oldBuffer, currentOutputStart, outputBuffer, outputOffset, currentOutputLength); outputOffset += currentOutputLength; } // finally copy the bytes writeCodepoint(replacement, outputBuffer, outputOffset, len); outputOffset += len; } target.setValPreallocated(row, outputOffset - outputStart); }
From source file:org.mgrover.hive.translate.GenericUDFTranslate.java
License:Apache License
/** * Pre-process the from and to strings populate {@link #replacementMap} and {@link #deletionSet}. * /*from w w w .j av a 2 s .c o m*/ * @param from * from string to be used for translation * @param to * to string to be used for translation */ private void populateMappings(Text from, Text to) { replacementMap.clear(); deletionSet.clear(); ByteBuffer fromBytes = ByteBuffer.wrap(from.getBytes(), 0, from.getLength()); ByteBuffer toBytes = ByteBuffer.wrap(to.getBytes(), 0, to.getLength()); // Traverse through the from string, one code point at a time while (fromBytes.hasRemaining()) { // This will also move the iterator ahead by one code point int fromCodePoint = Text.bytesToCodePoint(fromBytes); // If the to string has more code points, make sure to traverse it too if (toBytes.hasRemaining()) { int toCodePoint = Text.bytesToCodePoint(toBytes); // If the code point from from string already has a replacement or is to be deleted, we // don't need to do anything, just move on to the next code point if (replacementMap.containsKey(fromCodePoint) || deletionSet.contains(fromCodePoint)) { continue; } replacementMap.put(fromCodePoint, toCodePoint); } else { // If the code point from from string already has a replacement or is to be deleted, we // don't need to do anything, just move on to the next code point if (replacementMap.containsKey(fromCodePoint) || deletionSet.contains(fromCodePoint)) { continue; } deletionSet.add(fromCodePoint); } } }
From source file:org.mgrover.hive.translate.GenericUDFTranslate.java
License:Apache License
/** * Translates the input string based on {@link #replacementMap} and {@link #deletionSet} and * returns the translated string./*from w w w . j a va 2s. c om*/ * * @param input * input string to perform the translation on * @return translated string */ private String processInput(Text input) { StringBuilder resultBuilder = new StringBuilder(); // Obtain the byte buffer from the input string so we can traverse it code point by code point ByteBuffer inputBytes = ByteBuffer.wrap(input.getBytes(), 0, input.getLength()); // Traverse the byte buffer containing the input string one code point at a time while (inputBytes.hasRemaining()) { int inputCodePoint = Text.bytesToCodePoint(inputBytes); // If the code point exists in deletion set, no need to emit out anything for this code point. // Continue on to the next code point if (deletionSet.contains(inputCodePoint)) { continue; } Integer replacementCodePoint = replacementMap.get(inputCodePoint); // If a replacement exists for this code point, emit out the replacement and append it to the // output string. If no such replacement exists, emit out the original input code point char[] charArray = Character .toChars((replacementCodePoint != null) ? replacementCodePoint : inputCodePoint); resultBuilder.append(charArray); } String resultString = resultBuilder.toString(); return resultString; }