List of usage examples for org.apache.hadoop.fs FSDataInputStream seek
@Override public void seek(long desired) throws IOException
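Before the project examples, a minimal sketch of the call in isolation may help. The path below is a placeholder and the snippet assumes a Configuration that resolves to a reachable file system; it simply opens a file, seeks to its midpoint, and reads one byte from there.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        long len = fs.getFileStatus(path).getLen();
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(len / 2);  // jump to an absolute byte offset
            int b = in.read(); // reading resumes from that offset
            System.out.println("pos=" + in.getPos() + ", byte=" + b);
        }
    }
}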
From source file:com.ricemap.spateDB.mapred.SpatialRecordReader.java
License:Apache License
/**
 * Initialize from a path and range
 * @param job
 * @param s
 * @param l
 * @param p
 * @throws IOException
 */
public SpatialRecordReader(Configuration job, long s, long l, Path p) throws IOException {
    this.start = s;
    this.end = s + l;
    this.path = p;
    this.fs = this.path.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(this.path);
    this.blockSize = fs.getFileStatus(this.path).getBlockSize();
    this.cellMbr = new Prism();
    LOG.info("Open a SpatialRecordReader to file: " + this.path);
    codec = new CompressionCodecFactory(job).getCodec(this.path);

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = cIn;
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = codec.createInputStream(fileIn, decompressor);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = fileIn;
        filePosition = fileIn;
    }
    this.pos = start;
    this.maxShapesInOneRead = job.getInt(SpatialSite.MaxShapesInOneRead, 1000000);
    this.maxBytesInOneRead = job.getInt(SpatialSite.MaxBytesInOneRead, 32 * 1024 * 1024);
    initializeReader();
}
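Worth noting in this constructor: the explicit fileIn.seek(start) happens only on the plain, uncompressed branch. With a splittable codec the adjusted start and end are taken from the SplitCompressionInputStream instead, and with a non-splittable codec a mid-file seek is not meaningful, so decompression begins at the start of the file; filePosition is pointed at whichever stream can report a usable position.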
From source file:com.ricemap.spateDB.operations.RecordCount.java
License:Apache License
/**
 * Counts the approximate number of lines in a file by getting an approximate
 * average line length
 * @param fs
 * @param file
 * @return
 * @throws IOException
 */
public static <T> long recordCountApprox(FileSystem fs, Path file) throws IOException {
    final long fileSize = fs.getFileStatus(file).getLen();
    final FSDataInputStream in = fs.open(file);

    Estimator<Long> lineEstimator = new Estimator<Long>(0.05);
    lineEstimator.setRandomSample(new Estimator.RandomSample() {
        @Override
        public double next() {
            int lineLength = 0;
            try {
                long randomFilePosition = (long) (Math.random() * fileSize);
                in.seek(randomFilePosition);

                // Skip the rest of this line
                byte lastReadByte;
                do {
                    lastReadByte = in.readByte();
                } while (lastReadByte != '\n' && lastReadByte != '\r');

                while (in.getPos() < fileSize - 1) {
                    lastReadByte = in.readByte();
                    if (lastReadByte == '\n' || lastReadByte == '\r') {
                        break;
                    }
                    lineLength++;
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            return lineLength + 1;
        }
    });
    lineEstimator.setUserFunction(new Estimator.UserFunction<Long>() {
        @Override
        public Long calculate(double x) {
            return (long) (fileSize / x);
        }
    });
    lineEstimator.setQualityControl(new Estimator.QualityControl<Long>() {
        @Override
        public boolean isAcceptable(Long y1, Long y2) {
            return (double) Math.abs(y2 - y1) / Math.min(y1, y2) < 0.01;
        }
    });
    Estimator.Range<Long> lineCount = lineEstimator.getEstimate();
    in.close();
    return (lineCount.limit1 + lineCount.limit2) / 2;
}
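The Estimator class is project-specific, but the sampling trick itself is easy to lift out: seek to a random offset, discard the partial line the seek lands in, and measure the next full line. A minimal sketch under those assumptions (non-empty plain-text input; only '\n' treated as a line break, unlike the original, which also handles '\r'; class and method names are illustrative only):

import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LineLengthSampler {
    /** Measures one randomly chosen line, mirroring the next() callback above. */
    static long sampleLineLength(FileSystem fs, Path file, Random rnd) throws IOException {
        long fileSize = fs.getFileStatus(file).getLen();
        try (FSDataInputStream in = fs.open(file)) {
            in.seek((long) (rnd.nextDouble() * fileSize));
            // Skip the remainder of the line the seek landed in
            while (in.getPos() < fileSize && in.readByte() != '\n') {
            }
            long start = in.getPos();
            // Advance to the next newline (or end of file)
            while (in.getPos() < fileSize && in.readByte() != '\n') {
            }
            // Length of the sampled line, including its trailing newline if present
            return in.getPos() - start;
        }
    }
}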
From source file:com.ricemap.spateDB.operations.Sampler.java
License:Apache License
/**
 * Reads a sample of the given file and returns the number of items read.
 *
 * @param fs
 * @param file
 * @param count
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleLocalByCount(FileSystem fs,
        Path[] files, int count, long seed, ResultCollector<O> output, T inObj, O outObj) throws IOException {
    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }
    files = data_files.toArray(new Path[data_files.size()]);

    ResultCollector<T> converter = createConverter(output, inObj, outObj);

    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of file sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(seed);
    long[] offsets = new long[count];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0; // Number of records read so far
    int records_returned = 0;

    int file_i = 0; // Index of the current file being sampled
    while (record_i < count) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;

        // Open a stream to the current file and use it to read all samples
        // in this file
        FSDataInputStream current_file_in = fs.open(files[file_i]);
        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];

        // The start and end offsets of data within this block;
        // offsets are calculated relative to file start
        long data_start_offset = 0;
        if (current_file_in.readLong() == SpatialSite.RTreeFileMarker) {
            // This file is an RTree file. Update the start offset to point
            // to the first byte after the header
            data_start_offset = 8 + RTree.getHeaderSize(current_file_in);
        }
        // Get the end offset of data by searching for the beginning of the
        // last line. Skip the last line too, to ensure that the mapped
        // position will be before some line in the block
        long data_end_offset = current_file_size;
        current_file_in.seek(data_end_offset);
        data_end_offset = Tail.tail(current_file_in, 1, null, null);
        long file_data_size = data_end_offset - data_start_offset;

        // Keep sampling as long as record offsets are within this file
        while (record_i < count && (offsets[record_i] - files_start_offset[file_i]) < current_file_size) {
            offsets[record_i] -= files_start_offset[file_i];
            // Map file position to element index in this tree assuming fixed
            // size records
            long element_offset_in_file = offsets[record_i] * file_data_size / current_file_size
                    + data_start_offset;
            current_file_in.seek(element_offset_in_file);
            LineReader reader = new LineReader(current_file_in, 4096);

            Text line = new Text();
            reader.readLine(line); // Skip the rest of the current line
            reader.readLine(line); // Read next line

            // Report this element to output
            if (converter != null) {
                inObj.fromText(line);
                converter.collect(inObj);
            }
            record_i++;
            records_returned++;
        }
        current_file_in.close();
    }
    return records_returned;
}
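This sampler relies on the same seek-and-discard idiom as RecordCount above: after seeking to a mapped offset, the first readLine call throws away whatever partial line the seek landed in, and the second returns a complete record. It also reuses Tail.tail, shown next, to find where the last full line of each file begins.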
From source file:com.ricemap.spateDB.operations.Tail.java
License:Apache License
/**
 * Reads a maximum of n lines from the stream starting from its current
 * position and going backward.
 *
 * @param in - An input stream. It'll be scanned from its current position
 *   backward till position 0
 * @param n - Maximum number of lines to return
 * @param stockObject - An object used to deserialize lines read. It can
 *   be set to <code>null</code> if output is also <code>null</code>. In this
 *   case, nothing is reported to the output.
 * @param output - An output collector used to report lines read.
 * @return - The position of the beginning of the earliest line read from
 *   buffer.
 * @throws IOException
 */
public static <T extends TextSerializable> long tail(FSDataInputStream in, int n, T stockObject,
        ResultCollector<T> output) throws IOException {
    int lines_read = 0;
    long end = in.getPos();
    long offset_of_last_eol = end;
    long last_read_byte = end;

    LongWritable line_offset = new LongWritable();
    Text read_line = new Text();
    Text remainder_from_last_buffer = new Text();
    byte[] buffer = new byte[4096];

    while (last_read_byte > 0 && lines_read < n) {
        // Read next chunk from the back
        long first_byte_to_read = (last_read_byte - 1) - (last_read_byte - 1) % buffer.length;
        in.seek(first_byte_to_read);
        int bytes_to_read = (int) (last_read_byte - first_byte_to_read);
        // read() may return fewer bytes than requested; readFully fills the whole chunk
        in.readFully(buffer, 0, bytes_to_read);
        last_read_byte = first_byte_to_read;

        // Iterate over bytes in this buffer
        int i_last_byte_consumed_in_buffer = bytes_to_read;
        int i_last_byte_examined_in_buffer = bytes_to_read;
        while (i_last_byte_examined_in_buffer > 0 && lines_read < n) {
            byte byte_examined = buffer[--i_last_byte_examined_in_buffer];
            if (byte_examined == '\n' || byte_examined == '\r') {
                // Found an end of line character
                // Report this to output unless it's empty
                long offset_of_this_eol = first_byte_to_read + i_last_byte_examined_in_buffer;
                if (offset_of_last_eol - offset_of_this_eol > 1) {
                    if (output != null) {
                        read_line.clear();
                        // +1 is to skip the EOL at the beginning
                        read_line.append(buffer, i_last_byte_examined_in_buffer + 1,
                                i_last_byte_consumed_in_buffer - (i_last_byte_examined_in_buffer + 1));
                        // Also append bytes remaining from last buffer
                        if (remainder_from_last_buffer.getLength() > 0) {
                            read_line.append(remainder_from_last_buffer.getBytes(), 0,
                                    remainder_from_last_buffer.getLength());
                        }
                        line_offset.set(offset_of_this_eol + 1);
                        stockObject.fromText(read_line);
                        output.collect(stockObject);
                    }
                    lines_read++;
                    remainder_from_last_buffer.clear();
                }
                i_last_byte_consumed_in_buffer = i_last_byte_examined_in_buffer;
                offset_of_last_eol = offset_of_this_eol;
            }
        }
        if (i_last_byte_consumed_in_buffer > 0) {
            // There are still some bytes not consumed in buffer
            if (remainder_from_last_buffer.getLength() == 0) {
                // Store whatever is remaining in remainder_from_last_buffer
                remainder_from_last_buffer.append(buffer, 0, i_last_byte_consumed_in_buffer);
            } else {
                // Prepend remaining bytes to Text
                Text t = new Text();
                t.append(buffer, 0, i_last_byte_consumed_in_buffer);
                t.append(remainder_from_last_buffer.getBytes(), 0, remainder_from_last_buffer.getLength());
                remainder_from_last_buffer = t;
            }
        }
    }
    if (lines_read < n && remainder_from_last_buffer.getLength() > 0) {
        // There is still one last line that needs to be reported
        lines_read++;
        if (output != null) {
            read_line = remainder_from_last_buffer;
            line_offset.set(0);
            stockObject.fromText(read_line);
            output.collect(stockObject);
        }
        offset_of_last_eol = -1;
    }
    return offset_of_last_eol + 1;
}
From source file:com.ricemap.spateDB.operations.Tail.java
License:Apache License
/**
 * Reads a maximum of n non-empty lines from the end of the given file.
 * The position of the earliest line read is returned
 * @param fs
 * @param file
 * @param n
 * @param stockObject
 * @param output
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable> long tail(FileSystem fs, Path file, int n, T stockObject,
        ResultCollector<T> output) throws IOException {
    FSDataInputStream in = null;
    try {
        in = fs.open(file);
        long length = fs.getFileStatus(file).getLen();
        in.seek(length);
        return tail(in, n, stockObject, output);
    } finally {
        if (in != null)
            in.close();
    }
}
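Stripped of the project's TextSerializable plumbing, the seek-to-end idiom reduces to: look up the file length, seek near the end, and scan backward for a newline. A minimal, self-contained sketch along those lines (the 4 KB tail size is an assumption, as is the requirement that the last line fits in it; it reads one chunk instead of looping backward as the full tail() does):

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LastLine {
    static String lastLine(FileSystem fs, Path file) throws IOException {
        long len = fs.getFileStatus(file).getLen();
        int tail = (int) Math.min(4096, len); // assume the last line fits in 4 KB
        byte[] buf = new byte[tail];
        try (FSDataInputStream in = fs.open(file)) {
            in.seek(len - tail);       // position at the start of the tail chunk
            in.readFully(buf, 0, tail);
        }
        int end = tail;
        if (end > 0 && buf[end - 1] == '\n') end--;          // ignore a trailing newline
        int start = end;
        while (start > 0 && buf[start - 1] != '\n') start--; // back up to the line start
        return new String(buf, start, end - start, StandardCharsets.UTF_8);
    }
}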
From source file:com.rockstor.core.io.ChunkReader.java
License:Apache License
private static void align_read(FSDataInputStream in, long offset) throws IOException {
    int padding_bytes = (int) (offset & ALIGN_MASK);
    if (padding_bytes != 0) {
        in.seek(offset + ALIGN_BYTES - padding_bytes);
    }
}
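The snippet does not show the constants, but the arithmetic only works when ALIGN_BYTES is a power of two and ALIGN_MASK = ALIGN_BYTES - 1. A worked instance under that assumption (512-byte alignment, hypothetical values):

public class AlignDemo {
    public static void main(String[] args) {
        // Hypothetical constants; ChunkReader's actual values are not shown above
        final long ALIGN_BYTES = 512, ALIGN_MASK = ALIGN_BYTES - 1;
        long offset = 1000;
        long padding = offset & ALIGN_MASK; // 1000 & 511 = 488
        if (padding != 0) {
            long target = offset + ALIGN_BYTES - padding; // 1000 + 512 - 488 = 1024
            System.out.println("seek to " + target);      // next 512-byte boundary
        }
        // When offset is already aligned (padding == 0), no seek is issued at all
    }
}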
From source file:com.scaleoutsoftware.soss.hserver.hadoop.SubmittedJob.java
License:Apache License
@SuppressWarnings("unchecked")
private static <T> T getSplitDetails(FSDataInputStream inFile, long offset, Configuration configuration)
        throws IOException {
    inFile.seek(offset);
    String className = StringInterner.weakIntern(Text.readString(inFile));
    Class<T> cls;
    try {
        cls = (Class<T>) configuration.getClassByName(className);
    } catch (ClassNotFoundException ce) {
        IOException wrap = new IOException("Split class " + className + " not found");
        wrap.initCause(ce);
        throw wrap;
    }
    SerializationFactory factory = new SerializationFactory(configuration);
    Deserializer<T> deserializer = (Deserializer<T>) factory.getDeserializer(cls);
    deserializer.open(inFile);
    T split = deserializer.deserialize(null);
    return split;
}
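This mirrors the pattern Hadoop itself uses when rehydrating InputSplits from a job's split file: seek to the recorded byte offset, read the class name written there, then hand the still-positioned stream to a Deserializer. The seek is what allows many serialized splits to share one file, each addressed by its offset.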
From source file:com.taobao.datax.plugins.common.DFSUtils.java
License:Open Source License
/**
 * Check file type in hdfs.
 *
 * @param fs
 *        handle of {@link FileSystem}
 * @param path
 *        hdfs {@link Path}
 * @param conf
 *        {@link Configuration}
 * @return {@link HdfsFileType} TXT, COMP_TXT, SEQ
 */
public static HdfsFileType checkFileType(FileSystem fs, Path path, Configuration conf) throws IOException {
    FSDataInputStream is = null;
    try {
        is = fs.open(path);
        /* file is empty, use TXT reader */
        if (0 == is.available()) {
            return HdfsFileType.TXT;
        }

        switch (is.readShort()) {
        case 0x5345: // first two bytes are 'S', 'E'
            if (is.readByte() == 'Q') {
                // TODO: add RCFile
                return HdfsFileType.SEQ;
            }
            // not a SequenceFile header; fall through to the default handling
        default:
            is.seek(0);
            CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = compressionCodecFactory.getCodec(path);
            if (null == codec)
                return HdfsFileType.TXT;
            else {
                return HdfsFileType.COMP_TXT;
            }
        }
    } catch (IOException e) {
        throw e;
    } finally {
        if (null != is) {
            try {
                is.close();
            } catch (Exception ex) {
            }
        }
    }
}
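Here 0x5345 is the ASCII pair "SE"; with the following 'Q' byte it matches the SequenceFile magic "SEQ". The seek(0) in the default branch rewinds the bytes consumed by readShort() and readByte(), although note that CompressionCodecFactory.getCodec(path) decides by file name alone, so the rewind matters only if the stream were to be read further.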
From source file:com.tgam.hadoop.mapred.EscapedLineRecordReader.java
License:Apache License
public EscapedLineRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
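The --start; fileIn.seek(start) pair is the classic split-boundary idiom from Hadoop's LineRecordReader: by backing up one byte before seeking, a reader whose split does not begin at offset 0 is guaranteed to see the newline that terminates the previous split's last line. The following readLine then discards exactly one partial line, so every record is consumed by exactly one reader.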
From source file:com.thinkbiganalytics.inputformat.hadoop.mapred.OmnitureDataFileRecordReader.java
License:Open Source License
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
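Functionally this is the same split-boundary pattern as the EscapedLineRecordReader above; the differences are the EscapedLineReader used in place of LineReader (presumably to honor escaped line breaks in Omniture hit data) and the mapred.escapedlinereader.maxlength configuration key.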