List of usage examples for org.apache.hadoop.fs FSDataInputStream seek
@Override public void seek(long desired) throws IOException
From source file:diamondmapreduce.NLineRecordReader.java
License:Apache License
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) genericSplit; final Path file = split.getPath(); Configuration conf = context.getConfiguration(); this.maxLineLength = conf.getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE); FileSystem fs = file.getFileSystem(conf); start = split.getStart();//w w w. ja va 2s .c o m end = start + split.getLength(); boolean skipFirstLine = false; FSDataInputStream filein = fs.open(split.getPath()); if (start != 0) { skipFirstLine = true; --start; filein.seek(start); } in = new LineReader(filein, conf); if (skipFirstLine) { start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:dz.lab.hdfs.SeekReadFile.java
/** * @param args//from www . j av a 2 s . c o m */ public static void main(String[] args) throws IOException { Path fileToRead = new Path("/tmp/quotes.csv"); // read configuration from core-site.xml available in the classpath (under /resources) FileSystem fs = FileSystem.get(new Configuration()); FSDataInputStream input = null; try { // start at position 0 input = fs.open(fileToRead); System.out.print("start position=" + input.getPos() + ":"); IOUtils.copyBytes(input, System.out, 4096, false); // seek to position 11 input.seek(11); System.out.print("start position=" + input.getPos() + ":"); IOUtils.copyBytes(input, System.out, 4096, false); // seek back to position 0 input.seek(11); System.out.print("start position=" + input.getPos() + ":"); IOUtils.copyBytes(input, System.out, 4096, false); } finally { IOUtils.closeStream(input); } }
From source file:edu.ecnu.idse.TrajStore.util.FileUtil.java
License:Open Source License
/** * Copies a part of a file from a remote file system (e.g., HDFS) to a local * file. Returns a path to a local temporary file. * //from w ww . ja va 2 s . c o m * @param conf * @param split * @return * @throws IOException */ public static String copyFileSplit(Configuration conf, FileSplit split) throws IOException { FileSystem fs = split.getPath().getFileSystem(conf); // Special case of a local file. Skip copying the file if (fs instanceof LocalFileSystem && split.getStart() == 0) return split.getPath().toUri().getPath(); // Length of input file. We do not depend on split.length because it is // not // set by input format for performance reason. Setting it in the input // format would cost a lot of time because it runs on the client machine // while the record reader runs on slave nodes in parallel long length = fs.getFileStatus(split.getPath()).getLen(); FSDataInputStream in = fs.open(split.getPath()); in.seek(split.getStart()); ReadableByteChannel rbc = Channels.newChannel(in); // Prepare output file for write File tempFile = File.createTempFile(split.getPath().getName(), "tmp"); FileOutputStream out = new FileOutputStream(tempFile); out.getChannel().transferFrom(rbc, 0, length); rbc.close(); out.close(); return tempFile.getAbsolutePath(); }
From source file:edu.umn.cs.spatialHadoop.hdf.HDFFile.java
License:Open Source License
/**
 * Initializes a new HDF file from an input stream. This stream should not
 * be closed as long as the HDF file is used. Closing this HDFFile will
 * also close the underlying stream.
 *
 * <p>The constructor checks the 4-byte signature against
 * {@code HDFMagicNumber} and then walks a chain of blocks: each block is
 * preceded by a 16-bit size and a 32-bit offset of the next block, and the
 * chain ends when that offset is not positive.
 *
 * @param inStream stream positioned at the beginning of the HDF data
 * @throws IOException if reading from or seeking in the stream fails
 * @throws RuntimeException if the signature does not match
 */
public HDFFile(FSDataInputStream inStream) throws IOException {
  this.inStream = inStream;

  byte[] signature = new byte[4];
  inStream.readFully(signature);
  if (!Arrays.equals(signature, HDFMagicNumber))
    throw new RuntimeException("Not a valid HDF file");

  int blockSize = inStream.readUnsignedShort();
  int nextBlock;
  do {
    // Keep track of the location of the next block
    nextBlock = inStream.readInt();
    this.readBlock(blockSize);
    if (nextBlock != 0) {
      inStream.seek(nextBlock);
      // Read the size the same way as for the first block. A signed read
      // would turn sizes above Short.MAX_VALUE into negative numbers.
      blockSize = inStream.readUnsignedShort();
    }
  } while (nextBlock > 0);
}
From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License:Open Source License
/** * Perform a selection query that retrieves all points in the given range. * The range is specified in the two-dimensional array positions. * @param in//from www.j a v a 2 s . c om * @param query_mbr * @param output * @return number of matched records * @throws IOException */ public static int selectionQuery(FSDataInputStream in, Rectangle query_mbr, ResultCollector<PointValue> output) throws IOException { long treeStartPosition = in.getPos(); int numOfResults = 0; int resolution = in.readInt(); short fillValue = in.readShort(); int cardinality = in.readInt(); long[] timestamps = new long[cardinality]; for (int i = 0; i < cardinality; i++) timestamps[i] = in.readLong(); Vector<Integer> selectedStarts = new Vector<Integer>(); Vector<Integer> selectedEnds = new Vector<Integer>(); StockQuadTree stockQuadTree = getOrCreateStockQuadTree(resolution); // Nodes to be searched. Contains node positions in the array of nodes Stack<Integer> nodes_2b_searched = new Stack<Integer>(); nodes_2b_searched.add(0); // Root node (ID=1) Rectangle node_mbr = new Rectangle(); while (!nodes_2b_searched.isEmpty()) { int node_pos = nodes_2b_searched.pop(); stockQuadTree.getNodeMBR(node_pos, node_mbr); if (query_mbr.contains(node_mbr)) { // Add this node to the selection list and stop this branch if (!selectedEnds.isEmpty() && selectedEnds.lastElement() == stockQuadTree.nodesStartPosition[node_pos]) { // Merge with an adjacent range selectedEnds.set(selectedEnds.size() - 1, stockQuadTree.nodesEndPosition[node_pos]); } else { // add a new range selectedStarts.add(stockQuadTree.nodesStartPosition[node_pos]); selectedEnds.add(stockQuadTree.nodesEndPosition[node_pos]); } numOfResults += stockQuadTree.nodesEndPosition[node_pos] - stockQuadTree.nodesStartPosition[node_pos]; } else if (query_mbr.intersects(node_mbr)) { int first_child_id = stockQuadTree.nodesID[node_pos] * 4 + 0; int first_child_pos = Arrays.binarySearch(stockQuadTree.nodesID, first_child_id); if (first_child_pos < 0) { // No 
children. Hit a leaf node // Scan and add matching points only java.awt.Point record_coords = new Point(); for (int record_pos = stockQuadTree.nodesStartPosition[node_pos]; record_pos < stockQuadTree.nodesEndPosition[node_pos]; record_pos++) { stockQuadTree.getRecordCoords(record_pos, record_coords); if (query_mbr.contains(record_coords)) { // matched a record. if (!selectedEnds.isEmpty() && selectedEnds.lastElement() == record_pos) { // Merge with an adjacent range selectedEnds.set(selectedEnds.size() - 1, record_pos + 1); } else { // Add a new range of unit width selectedStarts.add(record_pos); selectedEnds.add(record_pos + 1); } numOfResults++; } } } else { // Non-leaf node. Add all children to the list of nodes to search // Add in reverse order to the stack so that results come in sorted order nodes_2b_searched.add(first_child_pos + 3); nodes_2b_searched.add(first_child_pos + 2); nodes_2b_searched.add(first_child_pos + 1); nodes_2b_searched.add(first_child_pos + 0); } } } if (output != null) { PointValue returnValue = new PointValue(); long dataStartPosition = treeStartPosition + getValuesStartOffset(cardinality); // Return all values in the selected ranges for (int iRange = 0; iRange < selectedStarts.size(); iRange++) { int treeStart = selectedStarts.get(iRange); int treeEnd = selectedEnds.get(iRange); long startPosition = dataStartPosition + selectedStarts.get(iRange) * cardinality * 2; in.seek(startPosition); for (int treePos = treeStart; treePos < treeEnd; treePos++) { // Retrieve the coords for the point at treePos stockQuadTree.getRecordCoords(treePos, returnValue); // Read all entries at current position for (int iValue = 0; iValue < cardinality; iValue++) { short value = in.readShort(); if (value != fillValue) { returnValue.value = value; returnValue.timestamp = timestamps[iValue]; output.collect(returnValue); } } } } } return numOfResults; }
From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License:Open Source License
/** * Perform a selection query that retrieves all points in the given range. * The range is specified in the two-dimensional array positions. * @param in/*from www. ja v a2 s. c om*/ * @param query_mbr * @return * @throws IOException */ public static Node aggregateQuery(FSDataInputStream in, Rectangle query_mbr) throws IOException { long treeStartPosition = in.getPos(); Node result = new Node(); int numOfSelectedRecords = 0; int resolution = in.readInt(); short fillValue = in.readShort(); int cardinality = in.readInt(); final Vector<Integer> selectedNodesPos = new Vector<Integer>(); final Vector<Integer> selectedStarts = new Vector<Integer>(); final Vector<Integer> selectedEnds = new Vector<Integer>(); StockQuadTree stockQuadTree = getOrCreateStockQuadTree(resolution); // Nodes to be searched. Contains node positions in the array of nodes Stack<Integer> nodes_2b_searched = new Stack<Integer>(); nodes_2b_searched.add(0); // Root node (ID=1) Rectangle node_mbr = new Rectangle(); while (!nodes_2b_searched.isEmpty()) { int node_pos = nodes_2b_searched.pop(); stockQuadTree.getNodeMBR(node_pos, node_mbr); if (query_mbr.contains(node_mbr)) { // Add this node to the selection list and stop this branch selectedNodesPos.add(node_pos); } else if (query_mbr.intersects(node_mbr)) { int first_child_id = stockQuadTree.nodesID[node_pos] * 4 + 0; int first_child_pos = Arrays.binarySearch(stockQuadTree.nodesID, first_child_id); if (first_child_pos < 0) { // No children. Hit a leaf node // Scan and add matching points only java.awt.Point record_coords = new Point(); for (int record_pos = stockQuadTree.nodesStartPosition[node_pos]; record_pos < stockQuadTree.nodesEndPosition[node_pos]; record_pos++) { stockQuadTree.getRecordCoords(record_pos, record_coords); if (query_mbr.contains(record_coords)) { // matched a record. 
if (!selectedEnds.isEmpty() && selectedEnds.lastElement() == record_pos) { // Merge with an adjacent range selectedEnds.set(selectedEnds.size() - 1, record_pos + 1); } else { // Add a new range of unit width selectedStarts.add(record_pos); selectedEnds.add(record_pos + 1); } numOfSelectedRecords++; } } } else { // Non-leaf node. Add all children to the list of nodes to search // Add in reverse order to the stack so that results come in sorted order nodes_2b_searched.add(first_child_pos + 3); nodes_2b_searched.add(first_child_pos + 2); nodes_2b_searched.add(first_child_pos + 1); nodes_2b_searched.add(first_child_pos + 0); } } } // Result 1: Accumulate all values // Sort disk offsets to eliminate backward seeks if (!selectedStarts.isEmpty()) { LOG.debug("Aggregate query selected " + selectedNodesPos.size() + " nodes and " + numOfSelectedRecords + " records"); final IndexedSortable sortable = new IndexedSortable() { @Override public int compare(int i, int j) { return selectedStarts.get(i) - selectedStarts.get(j); } @Override public void swap(int i, int j) { int temp = selectedStarts.get(i); selectedStarts.set(i, selectedStarts.get(j)); selectedStarts.set(j, temp); temp = selectedEnds.get(i); selectedEnds.set(i, selectedEnds.get(j)); selectedEnds.set(j, temp); } }; new QuickSort().sort(sortable, 0, selectedStarts.size()); long dataStartPosition = getValuesStartOffset(cardinality); Point resultCoords = new Point(); // Return all values in the selected ranges for (int iRange = 0; iRange < selectedStarts.size(); iRange++) { int treeStart = selectedStarts.get(iRange); int treeEnd = selectedEnds.get(iRange); long startPosition = dataStartPosition + selectedStarts.get(iRange) * cardinality * 2; in.seek(startPosition); for (int treePos = treeStart; treePos < treeEnd; treePos++) { // Retrieve the coords for the point at treePos stockQuadTree.getRecordCoords(treePos, resultCoords); // Read all entries at current position for (int iValue = 0; iValue < cardinality; iValue++) { 
short value = in.readShort(); if (value != fillValue) result.accumulate(value); } } } } // Result 2: Accumulate all nodes if (!selectedNodesPos.isEmpty()) { long nodesStartPosition = treeStartPosition + getNodesStartOffset(resolution, cardinality); // Sort node positions to eliminate backward seeks IndexedSortable nodeSortable = new IndexedSortable() { @Override public int compare(int i, int j) { return selectedNodesPos.get(i) - selectedNodesPos.get(j); } @Override public void swap(int i, int j) { int temp = selectedNodesPos.get(i); selectedNodesPos.set(i, selectedNodesPos.get(j)); selectedNodesPos.set(j, temp); } }; new QuickSort().sort(nodeSortable, 0, selectedNodesPos.size()); Node selectedNode = new Node(); for (int node_pos : selectedNodesPos) { long nodePosition = nodesStartPosition + node_pos * NodeSize; in.seek(nodePosition); selectedNode.readFields(in); result.accumulate(selectedNode); } } return result; }
From source file:edu.umn.cs.spatialHadoop.util.FileUtil.java
License:Open Source License
/** * Copies a part of a file from a remote file system (e.g., HDFS) to a local * file. Returns a path to a local temporary file. * /* www . java2 s . com*/ * @param conf * @param split * @return * @throws IOException */ public static String copyFileSplit(Configuration conf, FileSplit split) throws IOException { FileSystem fs = split.getPath().getFileSystem(conf); // Special case of a local file. Skip copying the file if (fs instanceof LocalFileSystem && split.getStart() == 0) return split.getPath().toUri().getPath(); File destFile = File.createTempFile(split.getPath().getName(), "tmp"); // Special handling for HTTP files for more efficiency /*if (fs instanceof HTTPFileSystem && split.getStart() == 0) { URL website = split.getPath().toUri().toURL(); ReadableByteChannel rbc = Channels.newChannel(website.openStream()); FileOutputStream fos = new FileOutputStream(destFile); fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); fos.close(); return destFile.getAbsolutePath(); }*/ // Length of input file. We do not depend on split.length because it is // not // set by input format for performance reason. Setting it in the input // format would cost a lot of time because it runs on the client machine // while the record reader runs on slave nodes in parallel long length = fs.getFileStatus(split.getPath()).getLen(); FSDataInputStream in = fs.open(split.getPath()); in.seek(split.getStart()); ReadableByteChannel rbc = Channels.newChannel(in); // Prepare output file for write FileOutputStream out = new FileOutputStream(destFile); out.getChannel().transferFrom(rbc, 0, length); in.close(); out.close(); return destFile.getAbsolutePath(); }
From source file:fi.tkk.ics.hadoop.bam.BAMRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { // This method should only be called once (see Hadoop API). However, // there seems to be disagreement between implementations that call // initialize() and Hadoop-BAM's own code that relies on // {@link BAMInputFormat} to call initialize() when the reader is // created. Therefore we add this check for the time being. if (isInitialized) close();//from ww w . ja v a2 s. c om isInitialized = true; final Configuration conf = ContextUtil.getConfiguration(ctx); final FileVirtualSplit split = (FileVirtualSplit) spl; final Path file = split.getPath(); final FileSystem fs = file.getFileSystem(conf); this.stringency = SAMHeaderReader.getValidationStringency(conf); final FSDataInputStream in = fs.open(file); codec = new BAMRecordCodec(SAMHeaderReader.readSAMHeaderFrom(in, conf)); in.seek(0); bci = new BlockCompressedInputStream( new WrapSeekable<FSDataInputStream>(in, fs.getFileStatus(file).getLen(), file)); final long virtualStart = split.getStartVirtualOffset(); fileStart = virtualStart >>> 16; virtualEnd = split.getEndVirtualOffset(); bci.seek(virtualStart); codec.setInputStream(bci); if (BAMInputFormat.DEBUG_BAM_SPLITTER) { final long recordStart = virtualStart & 0xffff; System.err.println( "XXX inizialized BAMRecordReader byte offset: " + fileStart + " record offset: " + recordStart); } }
From source file:fi.tkk.ics.hadoop.bam.BCFRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { isBGZF = spl instanceof FileVirtualSplit; if (isBGZF) { final FileVirtualSplit split = (FileVirtualSplit) spl; final Path file = split.getPath(); final FileSystem fs = file.getFileSystem(ContextUtil.getConfiguration(ctx)); final FSDataInputStream inFile = fs.open(file); bci = new BlockCompressedInputStream(inFile); in = new PositionalBufferedStream(bci); initContigDict();/*from ww w .j a v a 2 s . c om*/ inFile.seek(0); bci = new BlockCompressedInputStream( new WrapSeekable<FSDataInputStream>(inFile, fs.getFileStatus(file).getLen(), file)); final long virtualStart = split.getStartVirtualOffset(), virtualEnd = split.getEndVirtualOffset(); this.fileStart = virtualStart >>> 16; this.length = (virtualEnd >>> 16) - fileStart; bci.seek(virtualStart); // Since PositionalBufferedStream does its own buffering, we have to // prevent it from going too far by using a BGZFLimitingStream. It // also allows nextKeyValue() to simply check for EOF instead of // looking at virtualEnd. in = new PositionalBufferedStream(new BGZFLimitingStream(bci, virtualEnd)); } else { final FileSplit split = (FileSplit) spl; this.fileStart = split.getStart(); this.length = split.getLength(); final Path file = split.getPath(); in = new PositionalBufferedStream(file.getFileSystem(ContextUtil.getConfiguration(ctx)).open(file)); initContigDict(); in.skip(fileStart - in.getPosition()); } }
From source file:fi.tkk.ics.hadoop.bam.VCFRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { final FileSplit split = (FileSplit) spl; this.length = split.getLength(); final Path file = split.getPath(); final FileSystem fs = file.getFileSystem(ContextUtil.getConfiguration(ctx)); final FSDataInputStream ins = fs.open(file); reader = new AsciiLineReader(ins); it = new AsciiLineReaderIterator(reader); final Object h = codec.readHeader(it); if (!(h instanceof FeatureCodecHeader) || !(((FeatureCodecHeader) h).getHeaderValue() instanceof VCFHeader)) throw new IOException("No VCF header found in " + file); final VCFHeader header = (VCFHeader) ((FeatureCodecHeader) h).getHeaderValue(); contigDict.clear();/*from w w w . j a v a 2 s . c o m*/ int i = 0; for (final VCFContigHeaderLine contig : header.getContigLines()) contigDict.put(contig.getID(), i++); // Note that we create a new reader here, so reader.getPosition() is 0 at // start regardless of the value of start. Hence getProgress() and // nextKeyValue() don't need to use start at all. final long start = split.getStart(); if (start != 0) { ins.seek(start - 1); reader = new AsciiLineReader(ins); reader.readLine(); // NOTE: skip incomplete line! it = new AsciiLineReaderIterator(reader); } else { // it seems that newer versions of the reader peek ahead one more line from the input long current_pos = it.getPosition(); ins.seek(0); reader = new AsciiLineReader(ins); it = new AsciiLineReaderIterator(reader); while (it.hasNext() && it.getPosition() <= current_pos && it.peek().startsWith("#")) { it.next(); } if (!it.hasNext() || it.getPosition() > current_pos) throw new IOException("Empty VCF file " + file); } }