List of usage examples for org.apache.hadoop.io Text getLength
@Override public int getLength()
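Text stores its content as UTF-8 bytes, so getLength() reports the byte length of the current value, not its character count. A minimal sketch of the distinction (class name and string value are ours, for illustration only):

import org.apache.hadoop.io.Text;

public class GetLengthDemo {
    public static void main(String[] args) {
        Text t = new Text("caf\u00e9");             // 'é' occupies two bytes in UTF-8
        System.out.println(t.getLength());          // 5 (bytes)
        System.out.println(t.toString().length());  // 4 (characters)
    }
}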
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
public void mutability() throws IOException {
    Text t = new Text("hadoop");
    t.set("pig");
    assertThat(t.getLength(), is(3));
    assertThat(t.getBytes().length, is(3));
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
public void byteArrayNotShortened() throws IOException {
    Text t = new Text("hadoop");
    t.set(new Text("pig"));
    assertThat(t.getLength(), is(3));
    assertThat("Byte length not shortened", t.getBytes().length, is(6));
}
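The two tests above highlight a common pitfall: getBytes() exposes the backing buffer, which is not trimmed when a shorter value is set, so its length can exceed the logical value. getLength() bounds the valid region. A small sketch of the defensive-copy idiom (Arrays.copyOf here; Text.copyBytes(), used in a later example, does the same in recent Hadoop versions):

import java.util.Arrays;
import org.apache.hadoop.io.Text;

public class ValidBytesDemo {
    public static void main(String[] args) {
        Text t = new Text("hadoop");
        t.set("pig");
        byte[] raw = t.getBytes();                                   // length 6: "pig" plus stale "oop" bytes
        byte[] valid = Arrays.copyOf(t.getBytes(), t.getLength());   // length 3: exactly "pig"
        System.out.println(raw.length + " vs " + valid.length);
    }
}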
From source file: de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License: Apache License

/**
 * Reads a list of file paths, one path per line.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec)
                    .createInputStream(fis, decompressor, begin, end,
                            SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip the BOM (Byte Order Mark). Text only supports UTF-8,
            // so we only need to check for the UTF-8 BOM (0xEF,0xBB,0xBF)
            // at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF)
                    && (textBytes[1] == (byte) 0xBB) && (textBytes[2] == (byte) 0xBF)) {
                // found UTF-8 BOM, strip it
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to reuse the same buffer and
                    // not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        paths.add(nextLine.toString());

        // line too long. try again
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }
    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}
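The BOM handling in the loop above is a reusable pattern: since Text is always UTF-8, only the three-byte sequence 0xEF 0xBB 0xBF can appear as a BOM. A standalone sketch of the same logic (the helper name is ours, not from the source):

import org.apache.hadoop.io.Text;

/** Strips a leading UTF-8 BOM from t in place; returns the number of bytes removed. */
static int stripUtf8Bom(Text t) {
    byte[] bytes = t.getBytes();   // backing buffer; only the first getLength() bytes are valid
    int length = t.getLength();
    if (length >= 3 && bytes[0] == (byte) 0xEF && bytes[1] == (byte) 0xBB && bytes[2] == (byte) 0xBF) {
        if (length > 3) {
            byte[] copy = t.copyBytes();   // detach from the shared buffer before resetting it
            t.set(copy, 3, length - 3);
        } else {
            t.clear();
        }
        return 3;
    }
    return 0;
}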
From source file: diamondmapreduce.NLineRecordReader.java
License: Apache License

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (key == null) {
        key = new LongWritable();
    }
    key.set(pos);
    if (value == null) {
        value = new Text();
    }
    value.clear();

    final Text endline = new Text("\n");
    int newSize = 0;
    for (int i = 0; i < NLINESTOPROCESS; i++) {
        Text v = new Text();
        while (pos < end) {
            newSize = in.readLine(v, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
            value.append(v.getBytes(), 0, v.getLength());
            value.append(endline.getBytes(), 0, endline.getLength());
            if (newSize == 0) {
                break;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                break;
            }
        }
    }

    if (newSize == 0) {
        key = null;
        value = null;
        return false;
    } else {
        return true;
    }
}
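The inner loop above concatenates lines with Text.append(byte[], int, int), using getLength() to bound each piece so stale buffer bytes are never copied. The same joining idiom in isolation (the method name and Iterable parameter are ours):

import org.apache.hadoop.io.Text;

static Text joinLines(Iterable<Text> lines) {
    Text value = new Text();
    Text newline = new Text("\n");
    for (Text line : lines) {
        value.append(line.getBytes(), 0, line.getLength());         // only the valid bytes
        value.append(newline.getBytes(), 0, newline.getLength());
    }
    return value;
}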
From source file: eastcircle.terasort.TotalOrderPartitioner.java
License: Apache License

/**
 * Given a sorted set of cut points, build a trie that will find the correct
 * partition quickly.
 * @param splits the list of cut points
 * @param lower the lower bound of partitions 0..numPartitions-1
 * @param upper the upper bound of partitions 0..numPartitions-1
 * @param prefix the prefix that we have already checked against
 * @param maxDepth the maximum depth we will build a trie for
 * @return the trie node that will divide the splits correctly
 */
private static TrieNode buildTrie(Text[] splits, int lower, int upper, Text prefix, int maxDepth) {
    int depth = prefix.getLength();
    if (depth >= maxDepth || lower == upper) {
        return new LeafTrieNode(depth, splits, lower, upper);
    }
    InnerTrieNode result = new InnerTrieNode(depth);
    Text trial = new Text(prefix);
    // append an extra byte on to the prefix
    trial.append(new byte[1], 0, 1);
    int currentBound = lower;
    for (int ch = 0; ch < 255; ++ch) {
        trial.getBytes()[depth] = (byte) (ch + 1);
        lower = currentBound;
        while (currentBound < upper) {
            if (splits[currentBound].compareTo(trial) >= 0) {
                break;
            }
            currentBound += 1;
        }
        trial.getBytes()[depth] = (byte) ch;
        result.child[ch] = buildTrie(splits, lower, currentBound, trial, maxDepth);
    }
    // pick up the rest
    trial.getBytes()[depth] = (byte) 255;
    result.child[255] = buildTrie(splits, currentBound, upper, trial, maxDepth);
    return result;
}
From source file: edu.cmu.cs.in.hadoop.HoopKeyComparer.java
License: Open Source License

public int compare(Text o1, Text o2) {
    return compare(o1.getBytes(), 0, o1.getLength(),
                   o2.getBytes(), 0, o2.getLength());
}
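The six-argument compare(...) that this method delegates to has the same signature as WritableComparator.compareBytes, which performs an unsigned, lexicographic byte-wise comparison. A sketch of the equivalent comparison written directly against that utility (our framing; the actual delegate in HoopKeyComparer may differ):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;

static int compareTexts(Text o1, Text o2) {
    return WritableComparator.compareBytes(o1.getBytes(), 0, o1.getLength(),
                                           o2.getBytes(), 0, o2.getLength());
}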
From source file: edu.isi.mavuno.extract.ChunkExtractor.java
License: Apache License

private void loadChunkPairs() {
    // clear chunk pairs
    mChunkPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // extract chunks from sentence
    mChunks.clear();
    mChunkTokens.clear();

    List<TratzParsedTokenWritable> tokens = sentence.getTokens();

    Text lastNETag = new Text();
    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        byte chunkType = t.getChunkTag().getLength() > 0 ? t.getChunkTag().getBytes()[0] : 0;
        Text neTag = t.getNETag();

        if (neTag.compareTo(lastNETag.getBytes(), 0, lastNETag.getLength()) != 0
                || (neTag.getLength() == 1 && (neTag.getLength() > 0 && neTag.getBytes()[0] == 'O'))
                        && (chunkType == 'B' || chunkType == 'O')) {
            if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
                Text chunk = createChunk(mChunkTokens, mChunkType);
                mChunks.add(chunk);
            }
            mChunkTokens.clear();
            mChunkType.set(t.getChunkTag());
        }

        mChunkTokens.add(t.getToken());
        lastNETag.set(neTag);
    }

    // handle last chunk in sentence
    if (mChunkTokens.size() > 0) { // && mChunkType.getBytes()[0] != 'O') {
        Text chunk = createChunk(mChunkTokens, mChunkType);
        mChunks.add(chunk);
    }

    // generate adjacent (context, pattern) pairs
    for (int patternPos = 0; patternPos < mChunks.size() - 1; patternPos++) {
        Text leftPattern = new Text();
        leftPattern.append(mChunks.get(patternPos).getBytes(), 0, mChunks.get(patternPos).getLength());
        leftPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        addPair(mChunks.get(patternPos), leftPattern, mChunks.get(patternPos + 1));

        Text rightPattern = new Text();
        rightPattern.append(ADJACENT_PATTERN_NAME.getBytes(), 0, ADJACENT_PATTERN_NAME.getLength());
        rightPattern.append(mChunks.get(patternPos + 1).getBytes(), 0, mChunks.get(patternPos + 1).getLength());
        addPair(mChunks.get(patternPos), rightPattern, mChunks.get(patternPos + 1));
    }

    // generate non-adjacent (context, pattern) pairs based on chunks
    for (int patternPos = 0; patternPos < mChunks.size(); patternPos++) {
        for (int leftSkip = 0; leftSkip <= mMaxSkipSize; leftSkip++) {
            if (patternPos - leftSkip - 1 < 0) {
                continue;
            }
            if (mOrContextStyle && !mRightOnlyContextStyle) {
                addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                        ContextPatternWritable.ASTERISK);
            }
            if (mOrContextStyle && mLeftOnlyContextStyle) {
                continue;
            }
            for (int rightSkip = 0; rightSkip <= mMaxSkipSize; rightSkip++) {
                if (patternPos + rightSkip + 1 >= mChunks.size()) {
                    continue;
                }
                // construct (context, pattern) pair
                if (mOrContextStyle) {
                    addPair(ContextPatternWritable.ASTERISK, mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                } else {
                    addPair(mChunks.get(patternPos - leftSkip - 1), mChunks.get(patternPos),
                            mChunks.get(patternPos + rightSkip + 1));
                }
            }
        }
    }

    // get iterator
    mChunkPairsIter = mChunkPairs.iterator();
}
From source file: edu.isi.mavuno.extract.ChunkExtractor.java
License: Apache License

private void addPair(Text left, Text pattern, Text right) {
    ContextPatternWritable c;

    // forward pattern
    c = new ContextPatternWritable();
    c.setContext(MavunoUtils.createContext(left, right));
    c.setPattern(pattern);

    // add to chunk pairs
    mChunkPairs.add(c);

    // reverse pattern
    c = new ContextPatternWritable();
    c.setContext(MavunoUtils.createContext(right, left));
    c.setPattern(REVERSE_PATTERN);
    c.getPattern().append(pattern.getBytes(), 0, pattern.getLength());

    // add to chunk pairs
    mChunkPairs.add(c);
}
From source file: edu.isi.mavuno.extract.ChunkExtractor.java
License: Apache License

private Text createChunk(List<Text> terms, Text type) {
    Text t = new Text();

    for (int i = 0; i < terms.size(); i++) {
        Text term = terms.get(i);
        t.append(term.getBytes(), 0, term.getLength());
        if (i != terms.size() - 1) {
            t.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
        }
    }

    if (t.getLength() > 0 && !mSurfaceForms) {
        t.append(MavunoUtils.PIPE_BYTES, 0, MavunoUtils.PIPE_BYTES_LENGTH);
        if (type.getLength() > 2) {
            t.append(type.getBytes(), 2, type.getLength() - 2);
        }
    }

    return t;
}
From source file: edu.isi.mavuno.extract.DIRTExtractor.java
License: Apache License

private void loadDependPairs() {
    // clear dependency pairs
    mDependPairs.clear();

    // get sentence
    SentenceWritable<TratzParsedTokenWritable> sentence = mSentIter.next();

    // get sentence tokens
    List<TratzParsedTokenWritable> tokens = sentence.getTokens();

    // get chunk ids
    int[] chunkIds = NLProcTools.getChunkIds(tokens);

    // get mapping from positions to chunks
    Text[] chunks = new Text[tokens.size()];
    Text curChunk = null;
    for (int i = 0; i < tokens.size(); i++) {
        Text text = tokens.get(i).getToken();
        if (i == 0 || (i > 0 && chunkIds[i] != chunkIds[i - 1])) {
            curChunk = new Text(text);
        } else {
            curChunk.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
            curChunk.append(text.getBytes(), 0, text.getLength());
        }
        chunks[i] = curChunk;
    }

    // populate parse tree
    ArrayListOfInts[] children = new ArrayListOfInts[tokens.size() + 1];
    for (int i = 0; i < tokens.size() + 1; i++) {
        children[i] = new ArrayListOfInts();
    }
    for (int i = 0; i < tokens.size(); i++) {
        TratzParsedTokenWritable t = tokens.get(i);
        // ignore punctuation
        if (!t.getDependType().equals(PUNCTUATION_TYPE)) {
            children[t.getDependIndex()].add(i + 1);
        }
    }

    // extract (context, pattern) pairs from parse tree
    for (int i = 0; i < children[0].size(); i++) {
        extractPairs(children, children[0].get(i), tokens, chunks);
    }

    // get iterator
    mDependPairsIter = mDependPairs.iterator();
}