Example usage for org.apache.hadoop.fs.FSDataInputStream.seek

Introduction

This page collects example usages of org.apache.hadoop.fs.FSDataInputStream.seek(long).

Prototype

@Override
public void seek(long desired) throws IOException

Source Link

Document

Seek to the given offset.

Usage

From source file:org.mitre.bio.mapred.io.FastaRecordReader.java

License:Open Source License

/**
 * Opens the file behind {@code split} and prepares a line reader for it.
 *
 * <p>When a compression codec is registered for the file, the whole
 * decompressed stream is read and {@code end} is set to
 * {@link Long#MAX_VALUE}. Otherwise, if the split does not begin at offset 0,
 * the stream is positioned one byte before the split start and one line is
 * read and discarded; {@code start} is then advanced by the value returned
 * from {@code readLine} (presumably the number of bytes consumed), so reading
 * resumes at the first line boundary at or after the original split start.
 *
 * @param split the file split to read
 * @param job   the job configuration used to resolve the file system, the
 *              compression codec and the maximum line length
 * @throws IOException if the file cannot be opened, positioned or read
 */
public FastaRecordReader(FileSplit split, JobConf job) throws IOException {
    this.pushBackString = null;
    this.pushBackSize = 0;

    // NOTE(review): the limit is read from "io.file.buffer.size" although the
    // original comment mentions mapred.linereader.maxlength - confirm the key.
    this.maxLineLength = job.getInt("io.file.buffer.size", Integer.MAX_VALUE);

    this.start = split.getStart();
    this.end = this.start + split.getLength();
    final Path splitPath = split.getPath();

    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(splitPath);

    // Open the file; positioning happens below for uncompressed input only.
    final FileSystem fileSystem = splitPath.getFileSystem(job);
    final FSDataInputStream rawStream = fileSystem.open(split.getPath());

    if (codec == null) {
        final boolean startsMidFile = this.start != 0;
        if (startsMidFile) {
            LOG.info("Skipping first line in split");
            // Step back one byte so the discarded line ends at the first
            // line terminator at or after the original split start.
            this.start -= 1;
            rawStream.seek(this.start);
        }
        this.in = new LineReader(rawStream, job);
        if (startsMidFile) {
            // Discard the (possibly partial) line and re-establish "start".
            final int maxBytes = (int) Math.min((long) Integer.MAX_VALUE, this.end - this.start);
            this.start += this.in.readLine(new Text(), 0, maxBytes);
        }
    } else {
        this.in = new LineReader(codec.createInputStream(rawStream), job);
        this.end = Long.MAX_VALUE;
    }
    this.pos = this.start;
}

From source file:org.rassee.omniture.hadoop.mapred.OmnitureDataFileRecordReader.java

License:Open Source License

/**
 * Opens the file behind {@code split} and prepares an escaped-line reader.
 *
 * <p>Compressed input is read through the codec's stream with {@code end}
 * set to {@link Long#MAX_VALUE}. For uncompressed input that does not begin
 * at offset 0, the stream is positioned one byte before the split start and
 * one line is read and discarded; {@code start} is advanced by the value
 * {@code readLine} returns.
 *
 * @param job   configuration used to resolve the file system, the codec and
 *              the "mapred.escapedlinereader.maxlength" limit
 * @param split the file split to read
 * @throws IOException if the file cannot be opened, positioned or read
 */
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();

    final Path splitPath = split.getPath();
    final CompressionCodecFactory codecFactory = new CompressionCodecFactory(job);
    final CompressionCodec codec = codecFactory.getCodec(splitPath);

    final FileSystem fileSystem = splitPath.getFileSystem(job);
    final FSDataInputStream rawStream = fileSystem.open(split.getPath());

    if (codec != null) {
        // Compressed input: no seeking, read until the stream is exhausted.
        lineReader = new EscapedLineReader(codec.createInputStream(rawStream), job);
        end = Long.MAX_VALUE;
        this.pos = start;
        return;
    }

    final boolean discardLeadingLine = start != 0;
    if (discardLeadingLine) {
        // Back up one byte before the split start, then seek there.
        start--;
        rawStream.seek(start);
    }
    lineReader = new EscapedLineReader(rawStream, job);
    if (discardLeadingLine) {
        final int limit = (int) Math.min((long) Integer.MAX_VALUE, end - start);
        start += lineReader.readLine(new Text(), 0, limit);
    }
    this.pos = start;
}

From source file:org.rassee.omniture.hadoop.mapreduce.OmnitureDataFileRecordReader.java

License:Open Source License

/**
 * Opens the file behind the given split and prepares an escaped-line reader.
 *
 * <p>Compressed input is read through the codec's stream with {@code end}
 * set to {@link Long#MAX_VALUE}. For uncompressed input that does not begin
 * at offset 0, the stream is positioned one byte before the split start and
 * one line is read and discarded; {@code start} is advanced by the value
 * {@code readLine} returns.
 *
 * @param genericSplit the split to read; cast to {@code FileSplit}
 * @param context      task context supplying the configuration
 * @throws IOException          if the file cannot be opened, positioned or read
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    final FileSplit fileSplit = (FileSplit) genericSplit;
    final Configuration job = context.getConfiguration();

    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = fileSplit.getStart();
    this.end = this.start + fileSplit.getLength();

    final Path splitPath = fileSplit.getPath();
    this.compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = this.compressionCodecs.getCodec(splitPath);

    final FileSystem fileSystem = splitPath.getFileSystem(job);
    final FSDataInputStream rawStream = fileSystem.open(fileSplit.getPath());

    if (codec == null) {
        final boolean discardLeadingLine = start != 0;
        if (discardLeadingLine) {
            // Back up one byte before the split start, then seek there.
            start -= 1;
            rawStream.seek(start);
        }
        lineReader = new EscapedLineReader(rawStream, job);
        if (discardLeadingLine) {
            final int limit = (int) Math.min((long) Integer.MAX_VALUE, end - start);
            start += lineReader.readLine(new Text(), 0, limit);
        }
    } else {
        // Compressed input: no seeking, read until the stream is exhausted.
        lineReader = new EscapedLineReader(codec.createInputStream(rawStream), job);
        end = Long.MAX_VALUE;
    }
    this.pos = start;
}

From source file:org.seqdoop.hadoop_bam.VCFRecordReader.java

License:Open Source License

/**
 * Opens the VCF file behind the given split, reads its header and positions
 * the line iterator for reading the split.
 *
 * <p>The header is always read from the beginning of the file. Afterwards a
 * fresh reader is created: for a split that does not start at offset 0 the
 * stream is moved to one byte before the split start and one line is
 * discarded; for a split starting at 0 the stream is rewound and the lines
 * starting with "#" are skipped again.
 *
 * @param spl the split to read; cast to {@code FileSplit}
 * @param ctx the task context used to obtain the configuration
 * @throws IOException if no VCF header is found, if no line remains after
 *                     the header, or if reading fails
 */
@Override
public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException {
    final FileSplit split = (FileSplit) spl;

    this.length = split.getLength();

    final Path file = split.getPath();
    final FileSystem fs = file.getFileSystem(ContextUtil.getConfiguration(ctx));

    final FSDataInputStream ins = fs.open(file);

    // First pass: read the header from the start of the file.
    reader = new AsciiLineReader(ins);
    it = new AsciiLineReaderIterator(reader);

    final Object h = codec.readHeader(it);
    if (!(h instanceof FeatureCodecHeader) || !(((FeatureCodecHeader) h).getHeaderValue() instanceof VCFHeader))
        throw new IOException("No VCF header found in " + file);

    header = (VCFHeader) ((FeatureCodecHeader) h).getHeaderValue();

    // Rebuild the contig ID -> index mapping in the order the header lists the contigs.
    contigDict.clear();
    int i = 0;
    for (final VCFContigHeaderLine contig : header.getContigLines())
        contigDict.put(contig.getID(), i++);

    // Note that we create a new reader here, so reader.getPosition() is 0 at
    // start regardless of the value of start. Hence getProgress() and
    // nextKeyValue() don't need to use start at all.
    final long start = split.getStart();
    if (start != 0) {
        // Seek to one byte before the split start and drop the line found there.
        ins.seek(start - 1);
        reader = new AsciiLineReader(ins);
        reader.readLine(); // NOTE: skip incomplete line!
        it = new AsciiLineReaderIterator(reader);
    } else { // it seems that newer versions of the reader peek ahead one more line from the input
        // Remember where the header pass stopped, then rewind and skip the
        // "#" lines again with a fresh reader, never going past that position.
        long current_pos = it.getPosition();
        ins.seek(0);
        reader = new AsciiLineReader(ins);
        it = new AsciiLineReaderIterator(reader);
        while (it.hasNext() && it.getPosition() <= current_pos && it.peek().startsWith("#")) {
            it.next();
        }
        // Nothing left, or the iterator moved past the remembered position.
        if (!it.hasNext() || it.getPosition() > current_pos)
            throw new IOException("Empty VCF file " + file);
    }
}

From source file:org.springframework.data.hadoop.fs.FsShell.java

License:Apache License

/**
 * Returns the textual content of every file matched by the given URIs.
 *
 * <p>Each URI is treated as a glob pattern. The first two bytes of every
 * matched file select how it is decoded: the gzip magic number
 * ({@code 0x1f8b}) selects a {@link GZIPInputStream}, the prefix "SEQ" selects
 * a {@code TextRecordInputStream}, and anything else is read as-is from the
 * beginning of the file.
 *
 * <p>The raw stream is rewound <em>before</em> a decoder is attached to it,
 * never afterwards: {@code GZIPInputStream} consumes the gzip header in its
 * constructor, and the raw stream is already closed in the "SEQ" case. Streams
 * are closed after each file so that a glob matching several files does not
 * leak the earlier ones.
 *
 * @param uris the file URIs or glob patterns to read
 * @return one entry per matched file, in iteration order
 * @throws HadoopException if a file cannot be read
 */
public Collection<String> text(String... uris) {
    Collection<String> texts = new PrettyPrintList<String>(new ListPrinter<String>() {

        @Override
        public String toString(String e) throws Exception {
            return e + "\n";
        }
    });

    for (String uri : uris) {
        try {
            Path srcPat = new Path(uri);
            FileSystem srcFs = getFS(srcPat);

            for (Path src : FileUtil.stat2Paths(srcFs.globStatus(srcPat), srcPat)) {
                Assert.isTrue(srcFs.isFile(src), "Source must be a file");

                InputStream in = null;
                FSDataInputStream i = null;
                try {
                    i = srcFs.open(src);
                    switch (i.readShort()) {
                    case 0x1f8b: // RFC 1952
                        i.seek(0);
                        in = new GZIPInputStream(i);
                        break;
                    case 0x5345: // 'S' 'E'
                        if (i.readByte() == 'Q') {
                            i.close();
                            in = new TextRecordInputStream(src, srcFs, configuration);
                        } else {
                            // Starts with "SE" but is not a sequence file: plain content.
                            i.seek(0);
                            in = i;
                        }
                        break;
                    default:
                        i.seek(0);
                        in = i;
                        break;
                    }
                    texts.add(getContent(in));
                } finally {
                    // Close per file; closing the same stream twice is harmless here.
                    IOUtils.closeStream(in);
                    IOUtils.closeStream(i);
                }
            }
        } catch (IOException ex) {
            throw new HadoopException("Cannot read " + uri + ";" + ex.getMessage(), ex);
        }
    }
    return texts;
}

From source file:org.springframework.data.hadoop.store.AbstractStorage.java

License:Apache License

/**
 * Gets the input stream for input split.
 * /*w w  w.ja  v a  2 s  . com*/
 * @param split the split
 * @return the input stream
 * @throws IOException Signals that an I/O exception has occurred.
 */
protected synchronized StreamsHolder<InputStream> getInput(InputSplit split) throws IOException {
    StreamsHolder<InputStream> holder = splitInputHolders.get(split);
    if (holder == null) {
        log.info("Creating new InputStream for split");
        holder = new StreamsHolder<InputStream>();
        final FileSystem fs = basePath.getFileSystem(configuration);
        if (!isCompressed()) {
            FSDataInputStream input = fs.open(split.getPath());
            input.seek(split.getStart());
            holder.setStream(input);
        } else {
            Class<?> clazz = ClassUtils.resolveClassName(codecInfo.getCodecClass(),
                    getClass().getClassLoader());

            if (!ClassUtils.isAssignable(SplittableCompressionCodec.class, clazz)) {
                throw new StorageException("Not a SplittableCompressionCodec");
            }

            FSDataInputStream winput = fs.open(split.getPath());

            CompressionCodec compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(clazz,
                    getConfiguration());
            Decompressor decompressor = CodecPool.getDecompressor(compressionCodec);

            long start = split.getStart();
            long end = start + split.getLength();
            log.info("SplitCompressionInputStream start=" + start + " end=" + end);
            SplitCompressionInputStream input = ((SplittableCompressionCodec) compressionCodec)
                    .createInputStream(winput, decompressor, start, end,
                            SplittableCompressionCodec.READ_MODE.BYBLOCK);

            holder.setWrappedStream(winput);
            holder.setStream(input);
        }
        splitInputHolders.put(split, holder);
    }
    return holder;
}

From source file:org.terrier.structures.indexing.singlepass.hadoop.FileCollectionRecordReader.java

License:Mozilla Public License

/**
 * Opens a collection on the file at position {@code index} of the combined split.
 *
 * @param index position of the file within the combined split
 * @return the loaded collection, or {@code null} when {@code index} is past
 *         the last path of the split
 * @throws IOException if the file cannot be opened or the collection fails to load
 */
@Override
protected Collection openCollectionSplit(int index) throws IOException {
    final CombineFileSplit combined = (CombineFileSplit) split.getSplit();
    if (index >= combined.getNumPaths()) {
        // no more splits left to process
        return null;
    }

    final Path file = combined.getPath(index);
    logger.info("Opening " + file);
    final long offset = 0; // TODO populate from split?
    final FileSystem fileSystem = file.getFileSystem(config);

    // The WT2G collection has incorrectly named extensions. Terrier can deal
    // with this, Hadoop can't, so the codec is looked up with ".GZ" lower-cased.
    final Path codecLookupPath = new Path(file.toString().replaceAll("\\.GZ$", ".gz"));
    final CompressionCodec codec = compressionCodecs.getCodec(codecLookupPath);

    length = fileSystem.getFileStatus(file).getLen();
    // TODO: we could use utility.Files here if no codec was found
    final FSDataInputStream rawInput = fileSystem.open(file);
    start = offset;

    final InputStream collectionStream;
    if (codec == null) {
        if (start != 0) { // TODO: start is always zero?
            start -= 1;
            rawInput.seek(start);
        }
        inputStream = new CountingInputStream(rawInput, start);
        collectionStream = inputStream;
    } else {
        start = 0;
        inputStream = new CountingInputStream(rawInput);
        collectionStream = codec.createInputStream(inputStream);
    }

    final String collectionClass = ApplicationSetup.getProperty("trec.collection.class", "TRECCollection");
    final Class[] constructorTypes = new Class[] { InputStream.class };
    final Object[] constructorArgs = new Object[] { collectionStream };
    final Collection loaded = CollectionFactory.loadCollection(collectionClass, constructorTypes,
            constructorArgs);

    if (loaded == null) {
        throw new IOException("Collection did not load properly");
    }
    return loaded;
}

From source file:org.zuinnote.hadoop.office.format.mapred.AbstractSpreadSheetDocumentRecordReader.java

License:Apache License

/**
* Creates an Abstract Record Reader for tables from various document formats.
* The document is opened (decompressing it when a codec matches the file),
* parsed, and - when enabled - its linked workbooks are loaded as well.
*
* @param split Split to use (assumed to be a file split)
* @param job Configuration:
* hadoopoffice.read.mimeType: Mimetype of the document
* hadoopoffice.read.locale: Locale of the document (e.g. needed for interpreting spreadsheets) in the BCP47 format (cf. https://tools.ietf.org/html/bcp47). If not specified then default system locale will be used.
* hadoopoffice.read.sheets: A ":" separated list of sheets to be read. If not specified then all sheets will be read one after the other
* hadoopoffice.read.linkedworkbooks: true if linkedworkbooks should be fetched. They must be in the same folder as the main workbook. Linked Workbooks will be processed together with the main workbook on one node and thus it should be avoided to have a lot of linked workbooks. It does only read the linked workbooks that are directly linked to the main workbook. Default: false
* hadoopoffice.read.ignoremissinglinkedworkbooks: true if missing linked workbooks should be ignored. Default: false
* hadoopoffice.read.security.crypt.password: if set then hadoopoffice will try to decrypt the file
* hadoopoffice.read.security.crypt.linkedworkbooks.*: if set then hadoopoffice will try to decrypt all the linked workbooks where a password has been specified. If no password is specified then it is assumed that the linked workbook is not encrypted. Example: Property key for file "linkedworkbook1.xlsx" is "hadoopoffice.read.security.crypt.linkedworkbooks.linkedworkbook1.xlsx". Value is the password. You must not include path or protocol information in the filename
* hadoopoffice.read.filter.metadata: filters documents according to metadata. For example, hadoopoffice.read.filter.metadata.author will filter by author and the filter defined as value. Filtering is done by the parser and it is recommended that it supports regular expression for filtering, but this is up to the parser!
* @param reporter Reporter
*
* @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
* @throws org.zuinnote.hadoop.office.format.common.parser.FormatNotUnderstoodException in case the document has an invalid format
* @throws java.security.GeneralSecurityException declared by this constructor; presumably raised while reading the key or trust store (confirm against those helpers)
*
*/
public AbstractSpreadSheetDocumentRecordReader(FileSplit split, JobConf job, Reporter reporter)
        throws IOException, FormatNotUnderstoodException, GeneralSecurityException {
    // parse configuration
    this.conf = job;
    this.reporter = reporter;
    this.reporter.setStatus("Initialize Configuration");
    this.hocr = new HadoopOfficeReadConfiguration(this.conf);

    // Initialize start and end of split
    start = split.getStart();
    end = start + split.getLength();
    final Path documentPath = split.getPath();
    this.hocr.setFileName(documentPath.getName());
    this.readKeyStore(job);
    this.readTrustStore(job);
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(documentPath);
    final FSDataInputStream rawStream = documentPath.getFileSystem(job).open(documentPath);

    // open stream
    if (!isCompressedInput()) {
        LOG.debug("Reading from an uncompressed file \"" + documentPath + "\"");
        rawStream.seek(start);
        officeReader = new OfficeReader(rawStream, this.hocr);
        filePosition = rawStream;
    } else {
        // decompress
        decompressor = CodecPool.getDecompressor(codec);
        if (!(codec instanceof SplittableCompressionCodec)) {
            LOG.debug("Reading from a compressed file \"" + documentPath + "\" with non-splittable compression codec");
            officeReader = new OfficeReader(codec.createInputStream(rawStream, decompressor), this.hocr);
            filePosition = rawStream;
        } else {
            LOG.debug("Reading from a compressed file \"" + documentPath + "\" with splittable compression codec");
            final SplittableCompressionCodec splittableCodec = (SplittableCompressionCodec) codec;
            final SplitCompressionInputStream splitStream = splittableCodec.createInputStream(rawStream,
                    decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
            officeReader = new OfficeReader(splitStream, this.hocr);
            start = splitStream.getAdjustedStart();
            end = splitStream.getAdjustedEnd();
            filePosition = splitStream; // take pos from compressed stream
        }
    }

    this.reporter.setStatus("Parsing document");
    // initialize reader
    this.officeReader.parse();

    // read linked workbooks
    this.reporter.setStatus("Reading linked documents");
    if (this.hocr.getReadLinkedWorkbooks()) {
        // linked workbooks live next to the main workbook unless a custom location is configured
        Path linkedWorkbookDir = split.getPath().getParent();
        if (!"".equals(this.hocr.getLinkedWorkbookLocation())) {
            linkedWorkbookDir = new Path(this.hocr.getLinkedWorkbookLocation());
        }

        // read linked workbook filenames
        final List<String> linkedWorkbooks = this.officeReader.getCurrentParser().getLinkedWorkbooks();
        this.currentHFR = new HadoopFileReader(job);
        for (String linkedName : linkedWorkbooks) {
            LOG.info("Adding linked workbook \"" + linkedName + "\"");
            // keep only the file name; any path information in the link is dropped
            final String fileNameOnly = new Path(linkedName).getName();
            final Path linkedPath = new Path(linkedWorkbookDir, fileNameOnly);
            final InputStream linkedStream = this.currentHFR.openFile(linkedPath);
            this.officeReader.getCurrentParser().addLinkedWorkbook(linkedName, linkedStream,
                    this.hocr.getLinkedWBCredentialMap().get(fileNameOnly));
        }
    }
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.AbstractSpreadSheetDocumentRecordReader.java

License:Apache License

/**
* Initializes reader/* www.j  a  v  a2s  .  c  om*/
* @param split Split to use (assumed to be a file split)
* @param context context of the job
*
*
* @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
* @throws java.lang.InterruptedException in case of thread interruption
*
*/
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    try {
        FileSplit fSplit = (FileSplit) split;
        // Initialize start and end of split
        start = fSplit.getStart();
        end = start + fSplit.getLength();
        final Path file = fSplit.getPath();
        codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
        this.hocr.setFileName(file.getName());
        this.readKeyStore(context.getConfiguration());
        this.readTrustStore(context.getConfiguration());
        FSDataInputStream fileIn = file.getFileSystem(conf).open(file);
        // open stream
        if (isCompressedInput()) { // decompress
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                LOG.debug("Reading from a compressed file \"" + file + "\" with splittable compression codec");
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
                officeReader = new OfficeReader(cIn, this.hocr);
                start = cIn.getAdjustedStart();
                end = cIn.getAdjustedEnd();
                filePosition = cIn; // take pos from compressed stream
            } else {
                LOG.debug("Reading from a compressed file \"" + file
                        + "\" with non-splittable compression codec");
                officeReader = new OfficeReader(codec.createInputStream(fileIn, decompressor), this.hocr);
                filePosition = fileIn;
            }
        } else {
            LOG.debug("Reading from an uncompressed file \"" + file + "\"");
            fileIn.seek(start);
            officeReader = new OfficeReader(fileIn, this.hocr);
            filePosition = fileIn;
        }
        // initialize reader
        this.officeReader.parse();
        // read linked workbooks
        if (this.hocr.getReadLinkedWorkbooks()) {
            // get current path
            Path currentPath = fSplit.getPath();
            Path parentPath = currentPath.getParent();
            if (!"".equals(this.hocr.getLinkedWorkbookLocation())) {
                // use a custom location for linked workbooks
                parentPath = new Path(this.hocr.getLinkedWorkbookLocation());
            }
            // read linked workbook filenames
            List<String> linkedWorkbookList = this.officeReader.getCurrentParser().getLinkedWorkbooks();
            LOG.debug(linkedWorkbookList.size());
            this.currentHFR = new HadoopFileReader(context.getConfiguration());
            for (String listItem : linkedWorkbookList) {
                LOG.info("Adding linked workbook \"" + listItem + "\"");
                String sanitizedListItem = new Path(listItem).getName();
                // read file from hadoop file
                Path currentFile = new Path(parentPath, sanitizedListItem);
                InputStream currentIn = this.currentHFR.openFile(currentFile);
                this.officeReader.getCurrentParser().addLinkedWorkbook(listItem, currentIn,
                        this.hocr.getLinkedWBCredentialMap().get(sanitizedListItem));
            }
        }
    } catch (FormatNotUnderstoodException fnue) {
        LOG.error(fnue);
        this.close();
        throw new InterruptedException();
    }
}

From source file:parquet.hadoop.ParquetFileReader.java

License:Apache License

/**
 * Reads the meta data block in the footer of the file
 * @param configuration/*  w w w.  j a va  2 s. c  om*/
 * @param file the parquet File
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 */
public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file)
        throws IOException {
    FileSystem fileSystem = file.getPath().getFileSystem(configuration);
    FSDataInputStream f = fileSystem.open(file.getPath());
    try {
        long l = file.getLen();
        if (Log.DEBUG)
            LOG.debug("File length " + l);
        int FOOTER_LENGTH_SIZE = 4;
        if (l < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
            throw new RuntimeException(file.getPath() + " is not a Parquet file (too small)");
        }
        long footerLengthIndex = l - FOOTER_LENGTH_SIZE - MAGIC.length;
        if (Log.DEBUG)
            LOG.debug("reading footer index at " + footerLengthIndex);

        f.seek(footerLengthIndex);
        int footerLength = readIntLittleEndian(f);
        byte[] magic = new byte[MAGIC.length];
        f.readFully(magic);
        if (!Arrays.equals(MAGIC, magic)) {
            throw new RuntimeException(file.getPath() + " is not a Parquet file. expected magic number at tail "
                    + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
        }
        long footerIndex = footerLengthIndex - footerLength;
        if (Log.DEBUG)
            LOG.debug("read footer length: " + footerLength + ", footer index: " + footerIndex);
        if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
            throw new RuntimeException("corrupted file: the footer index is not within the file");
        }
        f.seek(footerIndex);
        return parquetMetadataConverter.readParquetMetadata(f);
    } finally {
        f.close();
    }
}