List of usage examples for the seek method of org.apache.hadoop.fs.FSDataInputStream
@Override public void seek(long desired) throws IOException
From source file:org.mitre.bio.mapred.io.FastaRecordReader.java
License:Open Source License
public FastaRecordReader(FileSplit split, JobConf job) throws IOException { this.pushBackString = null; this.pushBackSize = 0; this.maxLineLength = job.getInt("io.file.buffer.size", // mapred.linereader.maxlength Integer.MAX_VALUE);// w w w .j av a 2s . c o m this.start = split.getStart(); this.end = this.start + split.getLength(); final Path file = split.getPath(); this.compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = this.compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { this.in = new LineReader(codec.createInputStream(fileIn), job); this.end = Long.MAX_VALUE; } else { /** * From LineRecordReader, what is this doing? */ if (this.start != 0) { LOG.info("Skipping first line in split"); skipFirstLine = true; --this.start; fileIn.seek(this.start); } this.in = new LineReader(fileIn, job); } if (skipFirstLine) { /** * Skipping first line to re-established "start". */ this.start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:org.rassee.omniture.hadoop.mapred.OmnitureDataFileRecordReader.java
License:Open Source License
/**
 * Creates a record reader over one split of an Omniture data file.
 *
 * <p>For an uncompressed file whose split does not begin at offset 0, the stream is positioned
 * one byte before the split start and the first line is read and discarded; {@code start} is
 * advanced by the number of bytes consumed. For a file with a compression codec the codec
 * stream is read from its beginning and {@code end} is set to {@link Long#MAX_VALUE}.
 *
 * <p>If initialization fails after the file has been opened, the underlying stream is closed
 * before the exception propagates.
 *
 * @param job the configuration; {@code mapred.escapedlinereader.maxlength} bounds the line length
 * @param split the file split to read
 * @throws IOException if the file cannot be opened, positioned or read
 */
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean initialized = false;
    try {
        boolean skipFirstLine = false;
        if (codec != null) {
            lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
            end = Long.MAX_VALUE;
        } else {
            if (start != 0) {
                // Back up one byte so the discarded line ends at the first line boundary
                // at or after the original split start.
                skipFirstLine = true;
                --start;
                fileIn.seek(start);
            }
            lineReader = new EscapedLineReader(fileIn, job);
        }
        if (skipFirstLine) {
            start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
        }
        this.pos = start;
        initialized = true;
    } finally {
        if (!initialized) {
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // Best effort: the failure that aborted initialization is the one to report.
            }
        }
    }
}
From source file:org.rassee.omniture.hadoop.mapreduce.OmnitureDataFileRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE); this.start = split.getStart(); this.end = start + split.getLength(); final Path file = split.getPath(); this.compressionCodecs = new CompressionCodecFactory(job); final CompressionCodec codec = compressionCodecs.getCodec(file); // Open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); boolean skipFirstLine = false; if (codec != null) { lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job); end = Long.MAX_VALUE;//from w ww . j av a 2s . c om } else { if (start != 0) { skipFirstLine = true; --start; fileIn.seek(start); } lineReader = new EscapedLineReader(fileIn, job); } if (skipFirstLine) { start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start)); } this.pos = start; }
From source file:org.seqdoop.hadoop_bam.VCFRecordReader.java
License:Open Source License
@Override public void initialize(InputSplit spl, TaskAttemptContext ctx) throws IOException { final FileSplit split = (FileSplit) spl; this.length = split.getLength(); final Path file = split.getPath(); final FileSystem fs = file.getFileSystem(ContextUtil.getConfiguration(ctx)); final FSDataInputStream ins = fs.open(file); reader = new AsciiLineReader(ins); it = new AsciiLineReaderIterator(reader); final Object h = codec.readHeader(it); if (!(h instanceof FeatureCodecHeader) || !(((FeatureCodecHeader) h).getHeaderValue() instanceof VCFHeader)) throw new IOException("No VCF header found in " + file); header = (VCFHeader) ((FeatureCodecHeader) h).getHeaderValue(); contigDict.clear();/*from ww w. ja v a2s . c o m*/ int i = 0; for (final VCFContigHeaderLine contig : header.getContigLines()) contigDict.put(contig.getID(), i++); // Note that we create a new reader here, so reader.getPosition() is 0 at // start regardless of the value of start. Hence getProgress() and // nextKeyValue() don't need to use start at all. final long start = split.getStart(); if (start != 0) { ins.seek(start - 1); reader = new AsciiLineReader(ins); reader.readLine(); // NOTE: skip incomplete line! it = new AsciiLineReaderIterator(reader); } else { // it seems that newer versions of the reader peek ahead one more line from the input long current_pos = it.getPosition(); ins.seek(0); reader = new AsciiLineReader(ins); it = new AsciiLineReaderIterator(reader); while (it.hasNext() && it.getPosition() <= current_pos && it.peek().startsWith("#")) { it.next(); } if (!it.hasNext() || it.getPosition() > current_pos) throw new IOException("Empty VCF file " + file); } }
From source file:org.springframework.data.hadoop.fs.FsShell.java
License:Apache License
public Collection<String> text(String... uris) { Collection<String> texts = new PrettyPrintList<String>(new ListPrinter<String>() { @Override//from w ww. j ava 2 s . co m public String toString(String e) throws Exception { return e + "\n"; } }); for (String uri : uris) { InputStream in = null; FSDataInputStream i = null; try { Path srcPat = new Path(uri); FileSystem srcFs = getFS(srcPat); for (Path src : FileUtil.stat2Paths(srcFs.globStatus(srcPat), srcPat)) { Assert.isTrue(srcFs.isFile(src), "Source must be a file"); i = srcFs.open(src); switch (i.readShort()) { case 0x1f8b: // RFC 1952 i.seek(0); in = new GZIPInputStream(i); break; case 0x5345: // 'S' 'E' if (i.readByte() == 'Q') { i.close(); in = new TextRecordInputStream(src, srcFs, configuration); } break; default: in = i; break; } i.seek(0); texts.add(getContent(in)); } } catch (IOException ex) { throw new HadoopException("Cannot read " + uri + ";" + ex.getMessage(), ex); } finally { IOUtils.closeStream(in); IOUtils.closeStream(i); } } return texts; }
From source file:org.springframework.data.hadoop.store.AbstractStorage.java
License:Apache License
/** * Gets the input stream for input split. * /*w w w.ja v a 2 s . com*/ * @param split the split * @return the input stream * @throws IOException Signals that an I/O exception has occurred. */ protected synchronized StreamsHolder<InputStream> getInput(InputSplit split) throws IOException { StreamsHolder<InputStream> holder = splitInputHolders.get(split); if (holder == null) { log.info("Creating new InputStream for split"); holder = new StreamsHolder<InputStream>(); final FileSystem fs = basePath.getFileSystem(configuration); if (!isCompressed()) { FSDataInputStream input = fs.open(split.getPath()); input.seek(split.getStart()); holder.setStream(input); } else { Class<?> clazz = ClassUtils.resolveClassName(codecInfo.getCodecClass(), getClass().getClassLoader()); if (!ClassUtils.isAssignable(SplittableCompressionCodec.class, clazz)) { throw new StorageException("Not a SplittableCompressionCodec"); } FSDataInputStream winput = fs.open(split.getPath()); CompressionCodec compressionCodec = (CompressionCodec) ReflectionUtils.newInstance(clazz, getConfiguration()); Decompressor decompressor = CodecPool.getDecompressor(compressionCodec); long start = split.getStart(); long end = start + split.getLength(); log.info("SplitCompressionInputStream start=" + start + " end=" + end); SplitCompressionInputStream input = ((SplittableCompressionCodec) compressionCodec) .createInputStream(winput, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); holder.setWrappedStream(winput); holder.setStream(input); } splitInputHolders.put(split, holder); } return holder; }
From source file:org.terrier.structures.indexing.singlepass.hadoop.FileCollectionRecordReader.java
License:Mozilla Public License
/** Opens a collection on the next file. */ @Override//from w ww . j av a 2 s . c o m protected Collection openCollectionSplit(int index) throws IOException { if (index >= ((CombineFileSplit) split.getSplit()).getNumPaths()) { //no more splits left to process return null; } Path file = ((CombineFileSplit) split.getSplit()).getPath(index); logger.info("Opening " + file); long offset = 0;//TODO populate from split? FileSystem fs = file.getFileSystem(config); //WT2G collection has incorrectly named extensions. Terrier can deal with this, //Hadoop cant CompressionCodec codec = compressionCodecs.getCodec(new Path(file.toString().replaceAll("\\.GZ$", ".gz"))); length = fs.getFileStatus(file).getLen(); FSDataInputStream _input = fs.open(file); //TODO: we could use utility.Files here if //no codec was found InputStream internalInputStream = null; start = offset; if (codec != null) { start = 0; inputStream = new CountingInputStream(_input); internalInputStream = codec.createInputStream(inputStream); } else { if (start != 0) //TODO: start is always zero? { --start; _input.seek(start); } internalInputStream = inputStream = new CountingInputStream(_input, start); } Collection rtr = CollectionFactory.loadCollection( ApplicationSetup.getProperty("trec.collection.class", "TRECCollection"), new Class[] { InputStream.class }, new Object[] { internalInputStream }); if (rtr == null) { throw new IOException("Collection did not load properly"); } return rtr; }
From source file:org.zuinnote.hadoop.office.format.mapred.AbstractSpreadSheetDocumentRecordReader.java
License:Apache License
/** * Creates an Abstract Record Reader for tables from various document formats * @param split Split to use (assumed to be a file split) * @param job Configuration:/*from w ww. ja v a 2s .c om*/ * hadoopoffice.read.mimeType: Mimetype of the document * hadoopoffice.read.locale: Locale of the document (e.g. needed for interpreting spreadsheets) in the BCP47 format (cf. https://tools.ietf.org/html/bcp47). If not specified then default system locale will be used. * hadoopoffice.read.sheets: A ":" separated list of sheets to be read. If not specified then all sheets will be read one after the other * hadoopoffice.read.linkedworkbooks: true if linkedworkbooks should be fetched. They must be in the same folder as the main workbook. Linked Workbooks will be processed together with the main workbook on one node and thus it should be avoided to have a lot of linked workbooks. It does only read the linked workbooks that are directly linked to the main workbook. Default: false * hadoopoffice.read.ignoremissinglinkedworkbooks: true if missing linked workbooks should be ignored. Default: false * hadoopoffice.read.security.crypt.password: if set then hadoopoffice will try to decrypt the file * hadoopoffice.read.security.crypt.linkedworkbooks.*: if set then hadoopoffice will try to decrypt all the linked workbooks where a password has been specified. If no password is specified then it is assumed that the linked workbook is not encrypted. Example: Property key for file "linkedworkbook1.xlsx" is "hadoopoffice.read.security.crypt.linkedworkbooks.linkedworkbook1.xslx". Value is the password. You must not include path or protocol information in the filename * hadoopoffice.read.filter.metadata: filters documents according to metadata. For example, hadoopoffice.read.filter.metadata.author will filter by author and the filter defined as value. Filtering is done by the parser and it is recommended that it supports regular expression for filtering, but this is up to the parser! 
* @param reporter Reporter * * @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop * @throws org.zuinnote.hadoop.office.format.common.parser.FormatNotUnderstoodException in case the document has an invalid format * */ public AbstractSpreadSheetDocumentRecordReader(FileSplit split, JobConf job, Reporter reporter) throws IOException, FormatNotUnderstoodException, GeneralSecurityException { // parse configuration this.conf = job; this.reporter = reporter; this.reporter.setStatus("Initialize Configuration"); this.hocr = new HadoopOfficeReadConfiguration(this.conf); // Initialize start and end of split start = split.getStart(); end = start + split.getLength(); final Path file = split.getPath(); this.hocr.setFileName(file.getName()); this.readKeyStore(job); this.readTrustStore(job); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); FSDataInputStream fileIn = file.getFileSystem(job).open(file); // open stream if (isCompressedInput()) { // decompress decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { LOG.debug("Reading from a compressed file \"" + file + "\" with splittable compression codec"); final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS); officeReader = new OfficeReader(cIn, this.hocr); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; // take pos from compressed stream } else { LOG.debug("Reading from a compressed file \"" + file + "\" with non-splittable compression codec"); officeReader = new OfficeReader(codec.createInputStream(fileIn, decompressor), this.hocr); filePosition = fileIn; } } else { LOG.debug("Reading from an uncompressed file \"" + file + "\""); fileIn.seek(start); officeReader = new OfficeReader(fileIn, this.hocr); filePosition = fileIn; } 
this.reporter.setStatus("Parsing document"); // initialize reader this.officeReader.parse(); // read linked workbooks this.reporter.setStatus("Reading linked documents"); if (this.hocr.getReadLinkedWorkbooks()) { // get current path Path currentPath = split.getPath(); Path parentPath = currentPath.getParent(); if (!"".equals(this.hocr.getLinkedWorkbookLocation())) { // use a custom location for linked workbooks parentPath = new Path(this.hocr.getLinkedWorkbookLocation()); } // read linked workbook filenames List<String> linkedWorkbookList = this.officeReader.getCurrentParser().getLinkedWorkbooks(); this.currentHFR = new HadoopFileReader(job); for (String listItem : linkedWorkbookList) { LOG.info("Adding linked workbook \"" + listItem + "\""); String sanitizedListItem = new Path(listItem).getName(); // read file from hadoop file Path currentFile = new Path(parentPath, sanitizedListItem); InputStream currentIn = this.currentHFR.openFile(currentFile); this.officeReader.getCurrentParser().addLinkedWorkbook(listItem, currentIn, this.hocr.getLinkedWBCredentialMap().get(sanitizedListItem)); } } }
From source file:org.zuinnote.hadoop.office.format.mapreduce.AbstractSpreadSheetDocumentRecordReader.java
License:Apache License
/** * Initializes reader/* www.j a v a2s . c om*/ * @param split Split to use (assumed to be a file split) * @param context context of the job * * * @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop * @throws java.lang.InterruptedException in case of thread interruption * */ @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { try { FileSplit fSplit = (FileSplit) split; // Initialize start and end of split start = fSplit.getStart(); end = start + fSplit.getLength(); final Path file = fSplit.getPath(); codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file); this.hocr.setFileName(file.getName()); this.readKeyStore(context.getConfiguration()); this.readTrustStore(context.getConfiguration()); FSDataInputStream fileIn = file.getFileSystem(conf).open(file); // open stream if (isCompressedInput()) { // decompress decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { LOG.debug("Reading from a compressed file \"" + file + "\" with splittable compression codec"); final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS); officeReader = new OfficeReader(cIn, this.hocr); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; // take pos from compressed stream } else { LOG.debug("Reading from a compressed file \"" + file + "\" with non-splittable compression codec"); officeReader = new OfficeReader(codec.createInputStream(fileIn, decompressor), this.hocr); filePosition = fileIn; } } else { LOG.debug("Reading from an uncompressed file \"" + file + "\""); fileIn.seek(start); officeReader = new OfficeReader(fileIn, this.hocr); filePosition = fileIn; } // initialize reader this.officeReader.parse(); // read linked workbooks if 
(this.hocr.getReadLinkedWorkbooks()) { // get current path Path currentPath = fSplit.getPath(); Path parentPath = currentPath.getParent(); if (!"".equals(this.hocr.getLinkedWorkbookLocation())) { // use a custom location for linked workbooks parentPath = new Path(this.hocr.getLinkedWorkbookLocation()); } // read linked workbook filenames List<String> linkedWorkbookList = this.officeReader.getCurrentParser().getLinkedWorkbooks(); LOG.debug(linkedWorkbookList.size()); this.currentHFR = new HadoopFileReader(context.getConfiguration()); for (String listItem : linkedWorkbookList) { LOG.info("Adding linked workbook \"" + listItem + "\""); String sanitizedListItem = new Path(listItem).getName(); // read file from hadoop file Path currentFile = new Path(parentPath, sanitizedListItem); InputStream currentIn = this.currentHFR.openFile(currentFile); this.officeReader.getCurrentParser().addLinkedWorkbook(listItem, currentIn, this.hocr.getLinkedWBCredentialMap().get(sanitizedListItem)); } } } catch (FormatNotUnderstoodException fnue) { LOG.error(fnue); this.close(); throw new InterruptedException(); } }
From source file:parquet.hadoop.ParquetFileReader.java
License:Apache License
/** * Reads the meta data block in the footer of the file * @param configuration/* w w w. j a va 2 s. c om*/ * @param file the parquet File * @return the metadata blocks in the footer * @throws IOException if an error occurs while reading the file */ public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file) throws IOException { FileSystem fileSystem = file.getPath().getFileSystem(configuration); FSDataInputStream f = fileSystem.open(file.getPath()); try { long l = file.getLen(); if (Log.DEBUG) LOG.debug("File length " + l); int FOOTER_LENGTH_SIZE = 4; if (l < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC throw new RuntimeException(file.getPath() + " is not a Parquet file (too small)"); } long footerLengthIndex = l - FOOTER_LENGTH_SIZE - MAGIC.length; if (Log.DEBUG) LOG.debug("reading footer index at " + footerLengthIndex); f.seek(footerLengthIndex); int footerLength = readIntLittleEndian(f); byte[] magic = new byte[MAGIC.length]; f.readFully(magic); if (!Arrays.equals(MAGIC, magic)) { throw new RuntimeException(file.getPath() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic)); } long footerIndex = footerLengthIndex - footerLength; if (Log.DEBUG) LOG.debug("read footer length: " + footerLength + ", footer index: " + footerIndex); if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) { throw new RuntimeException("corrupted file: the footer index is not within the file"); } f.seek(footerIndex); return parquetMetadataConverter.readParquetMetadata(f); } finally { f.close(); } }