List of usage examples for org.apache.hadoop.fs FSDataInputStream seek
@Override public void seek(long desired) throws IOException
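Before the project examples, a minimal sketch of the call in isolation may help. The path below is a placeholder and the snippet assumes a Configuration that resolves to a reachable file system; it simply opens a file, seeks to its midpoint, and reads one byte from there.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        long len = fs.getFileStatus(path).getLen();
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(len / 2);  // jump to an absolute byte offset
            int b = in.read(); // reading resumes from that offset
            System.out.println("pos=" + in.getPos() + ", byte=" + b);
        }
    }
}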
From source file:com.ricemap.spateDB.mapred.SpatialRecordReader.java
License:Apache License
/**
 * Initialize from a path and range
 * @param job
 * @param s
 * @param l
 * @param p
 * @throws IOException
 */
public SpatialRecordReader(Configuration job, long s, long l, Path p) throws IOException {
    this.start = s;
    this.end = s + l;
    this.path = p;
    this.fs = this.path.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(this.path);
    this.blockSize = fs.getFileStatus(this.path).getBlockSize();
    this.cellMbr = new Prism();
    LOG.info("Open a SpatialRecordReader to file: " + this.path);
    codec = new CompressionCodecFactory(job).getCodec(this.path);

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = cIn;
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = codec.createInputStream(fileIn, decompressor);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = fileIn;
        filePosition = fileIn;
    }
    this.pos = start;
    this.maxShapesInOneRead = job.getInt(SpatialSite.MaxShapesInOneRead, 1000000);
    this.maxBytesInOneRead = job.getInt(SpatialSite.MaxBytesInOneRead, 32 * 1024 * 1024);
    initializeReader();
}
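Worth noting in this constructor: the explicit fileIn.seek(start) happens only on the plain, uncompressed branch. With a splittable codec the adjusted start and end are taken from the SplitCompressionInputStream instead, and with a non-splittable codec a mid-file seek is not meaningful, so decompression begins at the start of the file; filePosition is pointed at whichever stream can report a usable position.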
From source file:com.ricemap.spateDB.operations.RecordCount.java
License:Apache License
/**
 * Counts the approximate number of lines in a file by getting an approximate
 * average line length
 * @param fs
 * @param file
 * @return
 * @throws IOException
 */
public static <T> long recordCountApprox(FileSystem fs, Path file) throws IOException {
    final long fileSize = fs.getFileStatus(file).getLen();
    final FSDataInputStream in = fs.open(file);

    Estimator<Long> lineEstimator = new Estimator<Long>(0.05);
    lineEstimator.setRandomSample(new Estimator.RandomSample() {
        @Override
        public double next() {
            int lineLength = 0;
            try {
                long randomFilePosition = (long) (Math.random() * fileSize);
                in.seek(randomFilePosition);

                // Skip the rest of this line
                byte lastReadByte;
                do {
                    lastReadByte = in.readByte();
                } while (lastReadByte != '\n' && lastReadByte != '\r');

                while (in.getPos() < fileSize - 1) {
                    lastReadByte = in.readByte();
                    if (lastReadByte == '\n' || lastReadByte == '\r') {
                        break;
                    }
                    lineLength++;
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            return lineLength + 1;
        }
    });
    lineEstimator.setUserFunction(new Estimator.UserFunction<Long>() {
        @Override
        public Long calculate(double x) {
            return (long) (fileSize / x);
        }
    });
    lineEstimator.setQualityControl(new Estimator.QualityControl<Long>() {
        @Override
        public boolean isAcceptable(Long y1, Long y2) {
            return (double) Math.abs(y2 - y1) / Math.min(y1, y2) < 0.01;
        }
    });
    Estimator.Range<Long> lineCount = lineEstimator.getEstimate();
    in.close();
    return (lineCount.limit1 + lineCount.limit2) / 2;
}
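The Estimator class is project-specific, but the sampling trick itself is easy to lift out: seek to a random offset, discard the partial line the seek lands in, and measure the next full line. A minimal sketch under those assumptions (non-empty plain-text input; only '\n' treated as a line break, unlike the original, which also handles '\r'; class and method names are illustrative only):

import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LineLengthSampler {
    /** Measures one randomly chosen line, mirroring the next() callback above. */
    static long sampleLineLength(FileSystem fs, Path file, Random rnd) throws IOException {
        long fileSize = fs.getFileStatus(file).getLen();
        try (FSDataInputStream in = fs.open(file)) {
            in.seek((long) (rnd.nextDouble() * fileSize));
            // Skip the remainder of the line the seek landed in
            while (in.getPos() < fileSize && in.readByte() != '\n') {
            }
            long start = in.getPos();
            // Advance to the next newline (or end of file)
            while (in.getPos() < fileSize && in.readByte() != '\n') {
            }
            // Length of the sampled line, including its trailing newline if present
            return in.getPos() - start;
        }
    }
}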
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
/**
 * Reads a sample of the given file and returns the number of items read.
 *
 * @param fs
 * @param file
 * @param count
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleLocalByCount(FileSystem fs,
        Path[] files, int count, long seed, ResultCollector<O> output, T inObj, O outObj) throws IOException {
    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }
    files = data_files.toArray(new Path[data_files.size()]);

    ResultCollector<T> converter = createConverter(output, inObj, outObj);

    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of file sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(seed);
    long[] offsets = new long[count];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0; // Number of records read so far
    int records_returned = 0;

    int file_i = 0; // Index of the current file being sampled
    while (record_i < count) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;

        // Open a stream to the current file and use it to read all samples
        // in this file
        FSDataInputStream current_file_in = fs.open(files[file_i]);
        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];

        // The start and end offsets of data within this block;
        // offsets are calculated relative to file start
        long data_start_offset = 0;
        if (current_file_in.readLong() == SpatialSite.RTreeFileMarker) {
            // This file is an RTree file. Update the start offset to point
            // to the first byte after the header
            data_start_offset = 8 + RTree.getHeaderSize(current_file_in);
        }
        // Get the end offset of data by searching for the beginning of the
        // last line. Skip the last line too, to ensure that the mapped
        // position will be before some line in the block
        long data_end_offset = current_file_size;
        current_file_in.seek(data_end_offset);
        data_end_offset = Tail.tail(current_file_in, 1, null, null);
        long file_data_size = data_end_offset - data_start_offset;

        // Keep sampling as long as record offsets are within this file
        while (record_i < count && (offsets[record_i] - files_start_offset[file_i]) < current_file_size) {
            offsets[record_i] -= files_start_offset[file_i];
            // Map file position to element index in this tree assuming fixed
            // size records
            long element_offset_in_file = offsets[record_i] * file_data_size / current_file_size
                    + data_start_offset;
            current_file_in.seek(element_offset_in_file);
            LineReader reader = new LineReader(current_file_in, 4096);

            Text line = new Text();
            reader.readLine(line); // Skip the rest of the current line
            reader.readLine(line); // Read next line

            // Report this element to output
            if (converter != null) {
                inObj.fromText(line);
                converter.collect(inObj);
            }
            record_i++;
            records_returned++;
        }
        current_file_in.close();
    }
    return records_returned;
}
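This sampler relies on the same seek-and-discard idiom as RecordCount above: after seeking to a mapped offset, the first readLine call throws away whatever partial line the seek landed in, and the second returns a complete record. It also reuses Tail.tail, shown next, to find where the last full line of each file begins.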
From source file:com.ricemap.spateDB.operations.Tail.java
License:Apache License
/**
 * Reads a maximum of n lines from the stream starting from its current
 * position and going backward.
 *
 * @param in - An input stream. It'll be scanned from its current position
 *   backward till position 0
 * @param n - Maximum number of lines to return
 * @param stockObject - An object used to deserialize lines read. It can
 *   be set to <code>null</code> if output is also <code>null</code>. In this
 *   case, nothing is reported to the output.
 * @param output - An output collector used to report lines read.
 * @return - The position of the beginning of the earliest line read from
 *   buffer.
 * @throws IOException
 */
public static <T extends TextSerializable> long tail(FSDataInputStream in, int n, T stockObject,
        ResultCollector<T> output) throws IOException {
    int lines_read = 0;
    long end = in.getPos();
    long offset_of_last_eol = end;
    long last_read_byte = end;

    LongWritable line_offset = new LongWritable();
    Text read_line = new Text();
    Text remainder_from_last_buffer = new Text();
    byte[] buffer = new byte[4096];

    while (last_read_byte > 0 && lines_read < n) {
        // Read next chunk from the back
        long first_byte_to_read = (last_read_byte - 1) - (last_read_byte - 1) % buffer.length;
        in.seek(first_byte_to_read);
        int bytes_to_read = (int) (last_read_byte - first_byte_to_read);
        // read() may return fewer bytes than requested; readFully fills the whole chunk
        in.readFully(buffer, 0, bytes_to_read);
        last_read_byte = first_byte_to_read;

        // Iterate over bytes in this buffer
        int i_last_byte_consumed_in_buffer = bytes_to_read;
        int i_last_byte_examined_in_buffer = bytes_to_read;
        while (i_last_byte_examined_in_buffer > 0 && lines_read < n) {
            byte byte_examined = buffer[--i_last_byte_examined_in_buffer];
            if (byte_examined == '\n' || byte_examined == '\r') {
                // Found an end of line character
                // Report this to output unless it's empty
                long offset_of_this_eol = first_byte_to_read + i_last_byte_examined_in_buffer;
                if (offset_of_last_eol - offset_of_this_eol > 1) {
                    if (output != null) {
                        read_line.clear();
                        // +1 is to skip the EOL at the beginning
                        read_line.append(buffer, i_last_byte_examined_in_buffer + 1,
                                i_last_byte_consumed_in_buffer - (i_last_byte_examined_in_buffer + 1));
                        // Also append bytes remaining from last buffer
                        if (remainder_from_last_buffer.getLength() > 0) {
                            read_line.append(remainder_from_last_buffer.getBytes(), 0,
                                    remainder_from_last_buffer.getLength());
                        }
                        line_offset.set(offset_of_this_eol + 1);
                        stockObject.fromText(read_line);
                        output.collect(stockObject);
                    }
                    lines_read++;
                    remainder_from_last_buffer.clear();
                }
                i_last_byte_consumed_in_buffer = i_last_byte_examined_in_buffer;
                offset_of_last_eol = offset_of_this_eol;
            }
        }
        if (i_last_byte_consumed_in_buffer > 0) {
            // There are still some bytes not consumed in buffer
            if (remainder_from_last_buffer.getLength() == 0) {
                // Store whatever is remaining in remainder_from_last_buffer
                remainder_from_last_buffer.append(buffer, 0, i_last_byte_consumed_in_buffer);
            } else {
                // Prepend remaining bytes to Text
                Text t = new Text();
                t.append(buffer, 0, i_last_byte_consumed_in_buffer);
                t.append(remainder_from_last_buffer.getBytes(), 0, remainder_from_last_buffer.getLength());
                remainder_from_last_buffer = t;
            }
        }
    }
    if (lines_read < n && remainder_from_last_buffer.getLength() > 0) {
        // There is still one last line that needs to be reported
        lines_read++;
        if (output != null) {
            read_line = remainder_from_last_buffer;
            line_offset.set(0);
            stockObject.fromText(read_line);
            output.collect(stockObject);
        }
        offset_of_last_eol = -1;
    }
    return offset_of_last_eol + 1;
}
From source file:com.ricemap.spateDB.operations.Tail.java
License:Apache License
/**
 * Reads a maximum of n non-empty lines from the end of the given file.
 * The position of the earliest line read is returned
 * @param fs
 * @param file
 * @param n
 * @param stockObject
 * @param output
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable> long tail(FileSystem fs, Path file, int n, T stockObject,
        ResultCollector<T> output) throws IOException {
    FSDataInputStream in = null;
    try {
        in = fs.open(file);
        long length = fs.getFileStatus(file).getLen();
        in.seek(length);
        return tail(in, n, stockObject, output);
    } finally {
        if (in != null)
            in.close();
    }
}
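Stripped of the project's TextSerializable plumbing, the seek-to-end idiom reduces to: look up the file length, seek near the end, and scan backward for a newline. A minimal, self-contained sketch along those lines (the 4 KB tail size is an assumption, as is the requirement that the last line fits in it; it reads one chunk instead of looping backward as the full tail() does):

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LastLine {
    static String lastLine(FileSystem fs, Path file) throws IOException {
        long len = fs.getFileStatus(file).getLen();
        int tail = (int) Math.min(4096, len); // assume the last line fits in 4 KB
        byte[] buf = new byte[tail];
        try (FSDataInputStream in = fs.open(file)) {
            in.seek(len - tail);       // position at the start of the tail chunk
            in.readFully(buf, 0, tail);
        }
        int end = tail;
        if (end > 0 && buf[end - 1] == '\n') end--;          // ignore a trailing newline
        int start = end;
        while (start > 0 && buf[start - 1] != '\n') start--; // back up to the line start
        return new String(buf, start, end - start, StandardCharsets.UTF_8);
    }
}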
From source file:com.rockstor.core.io.ChunkReader.java
License:Apache License
private static void align_read(FSDataInputStream in, long offset) throws IOException {
    int padding_bytes = (int) (offset & ALIGN_MASK);
    if (padding_bytes != 0) {
        in.seek(offset + ALIGN_BYTES - padding_bytes);
    }
}
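The snippet does not show the constants, but the arithmetic only works when ALIGN_BYTES is a power of two and ALIGN_MASK = ALIGN_BYTES - 1. A worked instance under that assumption (512-byte alignment, hypothetical values):

public class AlignDemo {
    public static void main(String[] args) {
        // Hypothetical constants; ChunkReader's actual values are not shown above
        final long ALIGN_BYTES = 512, ALIGN_MASK = ALIGN_BYTES - 1;
        long offset = 1000;
        long padding = offset & ALIGN_MASK; // 1000 & 511 = 488
        if (padding != 0) {
            long target = offset + ALIGN_BYTES - padding; // 1000 + 512 - 488 = 1024
            System.out.println("seek to " + target);      // next 512-byte boundary
        }
        // When offset is already aligned (padding == 0), no seek is issued at all
    }
}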
From source file:com.scaleoutsoftware.soss.hserver.hadoop.SubmittedJob.java
License:Apache License
@SuppressWarnings("unchecked")
private static <T> T getSplitDetails(FSDataInputStream inFile, long offset, Configuration configuration)
        throws IOException {
    inFile.seek(offset);
    String className = StringInterner.weakIntern(Text.readString(inFile));
    Class<T> cls;
    try {
        cls = (Class<T>) configuration.getClassByName(className);
    } catch (ClassNotFoundException ce) {
        IOException wrap = new IOException("Split class " + className + " not found");
        wrap.initCause(ce);
        throw wrap;
    }
    SerializationFactory factory = new SerializationFactory(configuration);
    Deserializer<T> deserializer = (Deserializer<T>) factory.getDeserializer(cls);
    deserializer.open(inFile);
    T split = deserializer.deserialize(null);
    return split;
}
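This mirrors the pattern Hadoop itself uses when rehydrating InputSplits from a job's split file: seek to the recorded byte offset, read the class name written there, then hand the still-positioned stream to a Deserializer. The seek is what allows many serialized splits to share one file, each addressed by its offset.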
From source file:com.taobao.datax.plugins.common.DFSUtils.java
License:Open Source License
/**
 * Check file type in hdfs.
 *
 * @param fs
 *        handle of {@link FileSystem}
 * @param path
 *        hdfs {@link Path}
 * @param conf
 *        {@link Configuration}
 * @return {@link HdfsFileType} TXT, COMP_TXT, SEQ
 */
public static HdfsFileType checkFileType(FileSystem fs, Path path, Configuration conf) throws IOException {
    FSDataInputStream is = null;
    try {
        is = fs.open(path);
        /* file is empty, use TXT reader */
        if (0 == is.available()) {
            return HdfsFileType.TXT;
        }

        switch (is.readShort()) {
        case 0x5345: // first two bytes are 'S', 'E'
            if (is.readByte() == 'Q') {
                // TODO: add RCFile
                return HdfsFileType.SEQ;
            }
            // not a SequenceFile header; fall through to the default handling
        default:
            is.seek(0);
            CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = compressionCodecFactory.getCodec(path);
            if (null == codec)
                return HdfsFileType.TXT;
            else {
                return HdfsFileType.COMP_TXT;
            }
        }
    } catch (IOException e) {
        throw e;
    } finally {
        if (null != is) {
            try {
                is.close();
            } catch (Exception ex) {
            }
        }
    }
}
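Here 0x5345 is the ASCII pair "SE"; with the following 'Q' byte it matches the SequenceFile magic "SEQ". The seek(0) in the default branch rewinds the bytes consumed by readShort() and readByte(), although note that CompressionCodecFactory.getCodec(path) decides by file name alone, so the rewind matters only if the stream were to be read further.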
From source file:com.tgam.hadoop.mapred.EscapedLineRecordReader.java
License:Apache License
public EscapedLineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
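The --start; fileIn.seek(start) pair is the classic split-boundary idiom from Hadoop's LineRecordReader: by backing up one byte before seeking, a reader whose split does not begin at offset 0 is guaranteed to see the newline that terminates the previous split's last line. The following readLine then discards exactly one partial line, so every record is consumed by exactly one reader.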
From source file:com.thinkbiganalytics.inputformat.hadoop.mapred.OmnitureDataFileRecordReader.java
License:Open Source License
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
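Functionally this is the same split-boundary pattern as the EscapedLineRecordReader above; the differences are the EscapedLineReader used in place of LineReader (presumably to honor escaped line breaks in Omniture hit data) and the mapred.escapedlinereader.maxlength configuration key.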