List of usage examples for org.apache.hadoop.io.Text#getLength()
@Override public int getLength()
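Text.getLength() returns the number of valid bytes in the UTF-8 encoding. All of the examples below share one contract: Text.getBytes() returns the backing byte array, which may be longer than the content when the object is reused, so every read of the raw bytes must be bounded by getLength(). A minimal standalone sketch of that contract (class and variable names are illustrative, not taken from the source files below):

import org.apache.hadoop.io.Text;

public class TextLengthContract {
    public static void main(String[] args) {
        Text text = new Text("hello world");
        text.set(new Text("hi"));         // Reuse shrinks the content but not the backing array
        byte[] backing = text.getBytes(); // May still hold stale bytes past the valid content
        int valid = text.getLength();     // Number of valid UTF-8 bytes: 2
        // Correct: bound the read by getLength()
        System.out.println(new String(backing, 0, valid));  // "hi"
        // The backing array itself can be larger than the content
        System.out.println(backing.length + " >= " + valid);
    }
}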
From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java
License: Open Source License

public static OGCGeometry consumeGeometryESRI(Text text, char separator) {
    // Check whether this text is a Well Known Text (WKT) or a hexed string
    boolean wkt = false;
    byte[] bytes = text.getBytes();
    int length = text.getLength();
    int i_shape = 0;
    while (!wkt && i_shape < ShapeNames.length) {
        byte[] shapeName = ShapeNames[i_shape];
        if (length > shapeName.length) {
            int i = 0;
            while (i < shapeName.length && shapeName[i] == bytes[i])
                i++;
            if (i == shapeName.length) {
                wkt = true;
                break;
            }
        }
        i_shape++;
    }
    // Look for the terminator of the shape text
    int i1 = 0;
    if (bytes[i1] == '\'' || bytes[i1] == '\"') {
        separator = (char) bytes[i1++];
    }
    int i2 = i1;
    while (i2 < length && bytes[i2] != separator)
        i2++;
    String str = new String(bytes, i1, i2 - i1);
    // Remove consumed bytes from the text
    text.set(bytes, i2, text.getLength() - i2);
    OGCGeometry geom = parseText(str);
    return geom;
}
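The consume pattern above is destructive by design: after parsing, text.set(bytes, i2, text.getLength() - i2) drops the consumed prefix so the caller can keep pulling fields out of the same Text. A hypothetical caller is sketched below; the input line is invented for illustration, and only consumeGeometryESRI's signature comes from the code above (SpatialHadoop's TextSerializerHelper and the ESRI geometry library are assumed to be on the classpath):

import org.apache.hadoop.io.Text;
import com.esri.core.geometry.ogc.OGCGeometry;
import edu.umn.cs.spatialHadoop.io.TextSerializerHelper;

public class ConsumeGeometryDemo {
    public static void main(String[] args) {
        Text line = new Text("POINT (1 2),rest-of-record");
        OGCGeometry geom = TextSerializerHelper.consumeGeometryESRI(line, ',');
        // Per the code above, line now begins at the separator: the geometry bytes
        // were trimmed off, so further consume* calls can parse the remaining fields
        System.out.println(geom + " | remainder: " + line);
    }
}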
From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java
License: Open Source License

public static synchronized Geometry consumeGeometryJTS(Text text, char separator) {
    // Check whether this text is a Well Known Text (WKT) or a hexed string
    boolean wkt = false;
    byte[] bytes = text.getBytes();
    int length = text.getLength();
    Geometry geom;
    int i1, i2;   // Start and end offset of the geometry being parsed
    int i_next;   // Beginning of the next field
    boolean isWKT = false;
    boolean isHex = false;
    if (bytes[0] == '\'' || bytes[0] == '\"') {
        // A quoted string. Find terminating quote and trim the quotes
        i1 = 1;
        i2 = 2;
        while (i2 < length && bytes[i2] != bytes[0])
            i2++;
        if (i2 == length)
            throw new RuntimeException("Unterminated quoted string");
        i_next = i2 + 1;
        i2--;          // Back one step to remove the terminating quote
        isWKT = true;  // Assume any quoted string to be WKT
    } else {
        // Not a quoted string, check if the type is WKT
        int i_shape = 0;
        while (!wkt && i_shape < ShapeNames.length) {
            byte[] shapeName = ShapeNames[i_shape];
            if (length > shapeName.length) {
                int i = 0;
                while (i < shapeName.length && shapeName[i] == bytes[i])
                    i++;
                if (i == shapeName.length) {
                    wkt = true;
                    break;
                }
            }
            i_shape++;
        }
        if (i_shape < ShapeNames.length) {
            isWKT = true;
            // Look for the terminator of the shape text
            i1 = 0;
            i2 = 1;
            // Search for the first open parenthesis
            while (i2 < length && bytes[i2] != '(')
                i2++;
            if (i2 < length)
                i2++; // Skip the open parenthesis itself
            int nesting = 1;
            while (i2 < length && nesting > 0) {
                if (bytes[i2] == '(')
                    nesting++;
                else if (bytes[i2] == ')')
                    nesting--;
                i2++;
            }
            i_next = i2 + 1;
        } else {
            // Check if the type is hex-encoded WKB
            i1 = 0;
            i2 = 0;
            while (i2 < length && IsHex[bytes[i2]])
                i2++;
            isHex = i2 > 1;
            i_next = i2;
        }
    }
    String geom_text = new String(bytes, i1, i2);
    try {
        if (isWKT) {
            geom = wktReader.read(geom_text);
        } else if (isHex) {
            byte[] binary = hexToBytes(geom_text);
            geom = wkbReader.read(binary);
        } else {
            geom = null;
        }
    } catch (ParseException e) {
        throw new RuntimeException(String.format("Error parsing '%s'", geom_text), e);
    }
    // Remove consumed bytes from the text
    if (i_next >= text.getLength())
        text.clear();
    else {
        if (bytes[i_next] == separator)
            i_next++;
        text.set(bytes, i_next, length - i_next);
    }
    return geom;
}
From source file:edu.umn.cs.spatialHadoop.mapred.SpatialRecordReader.java
License: Open Source License

/**
 * Reads the next line from input and returns true if a line was read.
 * If no more lines are available in this split, false is returned.
 * @param value The text object to fill with the next line
 * @return <code>true</code> if a line was read; <code>false</code> otherwise.
 * @throws IOException If an error occurs while reading from disk.
 */
protected boolean nextLine(Text value) throws IOException {
    if (blockType == BlockType.RTREE && pos == 8) {
        // File is positioned at the RTree header
        // Skip the header and go to first data object in file
        pos += RTree.skipHeader(in);
        LOG.info("Skipped R-tree to position: " + pos);
        // Reinitialize record reader at the new position
        lineReader = new LineReader(in);
    }
    while (getFilePosition() <= end) {
        value.clear();
        int b = 0;
        if (buffer != null) {
            // Read the first line encountered in buffer
            int eol = RTree.skipToEOL(buffer, 0);
            b += eol;
            value.append(buffer, 0, eol);
            if (eol < buffer.length) {
                // There are still some bytes remaining in buffer
                byte[] tmp = new byte[buffer.length - eol];
                System.arraycopy(buffer, eol, tmp, 0, tmp.length);
                buffer = tmp;
            } else {
                buffer = null;
            }
            // Check if a complete line has been read from the buffer
            byte last_byte = value.getBytes()[value.getLength() - 1];
            if (last_byte == '\n' || last_byte == '\r')
                return true;
        }
        // Read the first line from stream
        Text temp = new Text();
        b += lineReader.readLine(temp);
        if (b == 0) {
            // Indicates an end of stream
            return false;
        }
        pos += b;
        // Append the part read from stream to the part extracted from buffer
        value.append(temp.getBytes(), 0, temp.getLength());
        if (value.getLength() > 1) {
            // Read a non-empty line. Note that end-of-line character is included
            return true;
        }
    }
    // Reached end of file
    return false;
}
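The buffer and stream parts are stitched together with value.append(temp.getBytes(), 0, temp.getLength()); passing temp.getLength() rather than temp.getBytes().length is what keeps stale bytes of a reused Text out of the result. A minimal standalone sketch of that append idiom (the values are invented):

import org.apache.hadoop.io.Text;

public class TextAppendDemo {
    public static void main(String[] args) {
        Text value = new Text("partial ");
        Text temp = new Text("line-from-stream");
        temp.set(new Text("line")); // Reused Text: backing array may exceed the content
        // Bound the copy by getLength() so only valid bytes are appended
        value.append(temp.getBytes(), 0, temp.getLength());
        System.out.println(value);  // "partial line"
    }
}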
From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3.java
License: Open Source License

/**
 * Reads the next line from input and returns true if a line was read.
 * If no more lines are available in this split, false is returned.
 * @param value The text object to fill with the next line
 * @return <code>true</code> if a line was read; <code>false</code> otherwise.
 * @throws IOException If an error occurs while reading from disk.
 */
protected boolean nextLine(Text value) throws IOException {
    while (getPos() <= end) {
        value.clear();
        int lineLength;
        // Read the first line from stream
        if ((lineLength = lineReader.readLine(value)) <= 0) {
            // Indicates an end of stream
            return false;
        }
        // Accumulate the number of bytes read from the stream
        bytesRead += lineLength;
        if (value.getLength() > 1) {
            // Read a non-empty line. Note that end-of-line character is included
            return true;
        }
    }
    // Reached end of file
    return false;
}
From source file:edu.umn.cs.spatialHadoop.nasa.NASAPoint.java
License: Open Source License

@Override
public void fromText(Text text) {
    super.fromText(text);
    byte[] bytes = text.getBytes();
    // Drop the leading byte in place, then consume the remaining fields
    text.set(bytes, 1, text.getLength() - 1);
    value = TextSerializerHelper.consumeInt(text, ',');
    timestamp = TextSerializerHelper.consumeLong(text, '\0');
}
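The in-place prefix strip, text.set(bytes, 1, text.getLength() - 1), works even though bytes aliases the Text's own backing array, because Text.set copies within the array. A small sketch of the same idiom in isolation (the input literal is made up):

import org.apache.hadoop.io.Text;

public class PrefixStripDemo {
    public static void main(String[] args) {
        Text text = new Text("#123,456"); // '#' stands in for a one-byte marker
        byte[] bytes = text.getBytes();
        // Drop the leading byte in place, exactly as fromText does above
        text.set(bytes, 1, text.getLength() - 1);
        System.out.println(text);         // "123,456"
    }
}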
From source file:edu.umn.cs.spatialHadoop.operations.FileMBR.java
License: Open Source License

public static Partition fileMBRLocal(Path[] inFiles, final OperationsParams params)
        throws IOException, InterruptedException {
    // 1- Split the input path/file to get splits that can be processed independently
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    Job job = Job.getInstance(params);
    SpatialInputFormat3.setInputPaths(job, inFiles);
    final List<org.apache.hadoop.mapreduce.InputSplit> splits = inputFormat.getSplits(job);
    int parallelism = params.getInt("parallel", Runtime.getRuntime().availableProcessors());

    // 2- Process splits in parallel
    List<Map<String, Partition>> allMbrs = Parallel.forEach(splits.size(),
            new RunnableRange<Map<String, Partition>>() {
                @Override
                public Map<String, Partition> run(int i1, int i2) {
                    Map<String, Partition> mbrs = new HashMap<String, Partition>();
                    for (int i = i1; i < i2; i++) {
                        try {
                            org.apache.hadoop.mapreduce.lib.input.FileSplit fsplit =
                                    (org.apache.hadoop.mapreduce.lib.input.FileSplit) splits.get(i);
                            final RecordReader<Rectangle, Iterable<Shape>> reader =
                                    inputFormat.createRecordReader(fsplit, null);
                            if (reader instanceof SpatialRecordReader3) {
                                ((SpatialRecordReader3) reader).initialize(fsplit, params);
                            } else if (reader instanceof RTreeRecordReader3) {
                                ((RTreeRecordReader3) reader).initialize(fsplit, params);
                            } else if (reader instanceof HDFRecordReader) {
                                ((HDFRecordReader) reader).initialize(fsplit, params);
                            } else {
                                throw new RuntimeException("Unknown record reader");
                            }
                            Partition p = mbrs.get(fsplit.getPath().getName());
                            if (p == null) {
                                p = new Partition();
                                p.filename = fsplit.getPath().getName();
                                p.cellId = p.filename.hashCode();
                                p.size = 0;
                                p.recordCount = 0;
                                p.set(Double.MAX_VALUE, Double.MAX_VALUE,
                                        -Double.MAX_VALUE, -Double.MAX_VALUE);
                                mbrs.put(p.filename, p);
                            }
                            Text temp = new Text2();
                            while (reader.nextKeyValue()) {
                                Iterable<Shape> shapes = reader.getCurrentValue();
                                for (Shape s : shapes) {
                                    Rectangle mbr = s.getMBR();
                                    if (mbr != null)
                                        p.expand(mbr);
                                    p.recordCount++;
                                    temp.clear();
                                    s.toText(temp);
                                    p.size += temp.getLength() + 1;
                                }
                            }
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e);
                        }
                    }
                    return mbrs;
                }
            }, parallelism);
    Map<String, Partition> mbrs = allMbrs.remove(allMbrs.size() - 1);
    for (Map<String, Partition> list : allMbrs) {
        for (Partition p1 : list.values()) {
            Partition p2 = mbrs.get(p1.filename);
            if (p2 != null) {
                p2.expand(p1);
            } else {
                mbrs.put(p1.filename, p1);
            }
        }
    }
    // Cache the final result, if needed
    for (Path inFile : inFiles) {
        FileSystem inFs = inFile.getFileSystem(params);
        if (!inFs.getFileStatus(inFile).isDir())
            continue;
        Path gindex_path = new Path(inFile, "_master.heap");
        // Answer has been already cached (may be by another job)
        if (inFs.exists(gindex_path))
            continue;
        FileStatus[] files = inFs.listStatus(inFile, SpatialSite.NonHiddenFileFilter);
        PrintStream wktout = new PrintStream(inFs.create(new Path(inFile, "_heap.wkt"), false));
        PrintStream gout = new PrintStream(inFs.create(gindex_path, false));
        Text text = new Text2();
        for (FileStatus file : files) {
            text.clear();
            Partition p = mbrs.get(file.getPath().getName());
            gout.println(p.toText(text).toString());
            wktout.println(p.toWKT());
        }
        wktout.close();
        gout.close();
    }
    // Return the final answer
    Partition finalResult = new Partition();
    finalResult.size = finalResult.recordCount = 0;
    finalResult.x1 = finalResult.y1 = Double.MAX_VALUE;
    finalResult.x2 = finalResult.y2 = -Double.MAX_VALUE;
    for (Partition p2 : mbrs.values())
        finalResult.expand(p2);
    return finalResult;
}
From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java
License: Open Source License

/**
 * Read from the given stream until end-of-line is reached.
 * @param in the input stream from where to read the line
 * @param line the line that has been read from file not including EOL
 * @return the total length of <code>line</code> after appending, as reported
 *         by Text#getLength() (EOL characters are not included)
 * @throws IOException
 */
public static int readUntilEOL(InputStream in, Text line) throws IOException {
    final byte[] bufferBytes = new byte[1024];
    int bufferLength = 0; // Length of the buffer
    do {
        if (bufferLength == bufferBytes.length) {
            // Buffer full. Copy to the output text
            line.append(bufferBytes, 0, bufferLength);
            bufferLength = 0;
        }
        if (bufferLength == 0) {
            // Read and skip any initial EOL characters
            do {
                bufferBytes[0] = (byte) in.read();
            } while (bufferBytes[0] != -1 && (bufferBytes[0] == '\n' || bufferBytes[0] == '\r'));
            if (bufferBytes[0] != -1)
                bufferLength++;
        } else {
            bufferBytes[bufferLength++] = (byte) in.read();
        }
    } while (bufferLength > 0 && bufferBytes[bufferLength - 1] != -1
            && bufferBytes[bufferLength - 1] != '\n' && bufferBytes[bufferLength - 1] != '\r');
    if (bufferLength > 0) {
        bufferLength--;
        line.append(bufferBytes, 0, bufferLength);
    }
    return line.getLength();
}
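Since readUntilEOL appends to line and returns line.getLength(), callers must clear the Text between lines. A hypothetical driver (the input string is invented):

import java.io.ByteArrayInputStream;
import org.apache.hadoop.io.Text;
import edu.umn.cs.spatialHadoop.operations.LocalSampler;

public class ReadUntilEOLDemo {
    public static void main(String[] args) throws Exception {
        ByteArrayInputStream in = new ByteArrayInputStream("first\nsecond".getBytes("UTF-8"));
        Text line = new Text();
        int n = LocalSampler.readUntilEOL(in, line);
        System.out.println(line + " (" + n + ")"); // "first (5)" - terminator stripped
        line.clear(); // readUntilEOL appends, so clear before reusing the Text
        LocalSampler.readUntilEOL(in, line);
        System.out.println(line);                  // "second"
    }
}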
From source file:edu.umn.cs.spatialHadoop.operations.Sampler.java
License: Open Source License

private static <T extends TextSerializable> int sampleLocalByCount(Path[] files,
        ResultCollector<T> output, OperationsParams params) throws IOException {
    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        FileSystem fs = file.getFileSystem(params);
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }
    files = data_files.toArray(new Path[data_files.size()]);

    TextSerializable inObj1, outObj1;
    inObj1 = OperationsParams.getTextSerializable(params, "shape", new Text2());
    outObj1 = OperationsParams.getTextSerializable(params, "outshape", new Text2());

    // Make the objects final to be able to use in the anonymous inner class
    final TextSerializable inObj = inObj1;
    final T outObj = (T) outObj1;
    ResultCollector<TextSerializable> converter = createConverter(output, inObj, outObj);

    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of files sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        FileSystem fs = files[i_file].getFileSystem(params);
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(params.getLong("seed", System.currentTimeMillis()));
    long[] offsets = new long[params.getInt("count", 0)];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0;        // Number of records read so far
    int records_returned = 0;
    int file_i = 0;          // Index of the current file being sampled
    while (record_i < offsets.length) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;
        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];
        FileSystem fs = files[file_i].getFileSystem(params);
        ShapeLineRecordReader reader = new ShapeLineRecordReader(fs.getConf(),
                new FileSplit(files[file_i], 0, current_file_size, new String[] {}));
        Rectangle key = reader.createKey();
        Text line = reader.createValue();
        long pos = files_start_offset[file_i];
        while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1]
                && reader.next(key, line)) {
            pos += line.getLength();
            if (pos > offsets[record_i]) {
                // Passed the offset of record_i. Report this element to output
                if (converter != null) {
                    inObj.fromText(line);
                    converter.collect(inObj);
                }
                record_i++;
                records_returned++;
            }
        }
        reader.close();
        // Skip any remaining records that were supposed to be read from this file.
        // This case might happen if a generated random position was in the middle
        // of the last line.
        while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1])
            record_i++;
    }
    return records_returned;
}
From source file:edu.umn.cs.spatialHadoop.osm.OSMPoint.java
License: Open Source License

@Override
public void fromText(Text text) {
    id = TextSerializerHelper.consumeLong(text, '\t');
    x = TextSerializerHelper.consumeDouble(text, '\t');
    y = TextSerializerHelper.consumeDouble(text, '\t');
    // Any remaining bytes hold the optional tag map
    if (text.getLength() > 0)
        TextSerializerHelper.consumeMap(text, tags);
}
From source file:es.pic.astro.hadoop.io.BinaryOutputFormat.java
License: Apache License

/**
 * Create the final out file and output row by row. After one row is
 * appended, a configured row separator is appended.
 *
 * @param jc the job configuration file
 * @param outPath the final output file to be created
 * @param valueClass the value class used for create
 * @param isCompressed whether the content is compressed or not
 * @param tableProperties the tableProperties of this file's corresponding table
 * @param progress progress used for status report
 * @return the RecordWriter
 */
@Override
public RecordWriter getHiveRecordWriter(JobConf jc, Path outPath, Class<? extends Writable> valueClass,
        boolean isCompressed, Properties tableProperties, Progressable progress) throws IOException {
    FileSystem fs = outPath.getFileSystem(jc);
    final OutputStream outStream = Utilities.createCompressedStream(jc,
            fs.create(outPath, progress), isCompressed);
    return new RecordWriter() {
        @Override
        public void write(Writable r) throws IOException {
            if (r instanceof Text) {
                Text tr = (Text) r;
                // Write only the valid bytes; the backing array may be longer than getLength()
                outStream.write(tr.getBytes(), 0, tr.getLength());
            } else {
                // DynamicSerDe always writes out BytesWritable
                BytesWritable bw = (BytesWritable) r;
                outStream.write(bw.get(), 0, bw.getSize());
            }
        }

        @Override
        public void close(boolean abort) throws IOException {
            outStream.close();
        }
    };
}
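The Text branch above is the canonical write idiom: writing tr.getBytes() alone would also flush whatever stale bytes sit past the content of a reused object, so the write is bounded by tr.getLength(). A standalone sketch of the difference (stream and values invented):

import java.io.ByteArrayOutputStream;
import org.apache.hadoop.io.Text;

public class BoundedWriteDemo {
    public static void main(String[] args) throws Exception {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Text tr = new Text("a long first record");
        tr.set(new Text("short"));  // Reused Text keeps its old, larger backing array
        out.write(tr.getBytes(), 0, tr.getLength()); // Writes exactly the 5 valid bytes
        System.out.println(out.toString("UTF-8"));   // "short", no stale tail
    }
}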