List of usage examples for org.apache.hadoop.fs FSDataInputStream seek
@Override public void seek(long desired) throws IOException
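FSDataInputStream.seek(long desired) repositions the stream at an absolute byte offset from the start of the file; the next read continues from there, and seeking past the end of the file is an error. A minimal sketch of the basic pattern (the path and offsets are illustrative only, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/records.dat"); // hypothetical input file, at least 192 bytes long
        FileSystem fs = path.getFileSystem(conf);
        FSDataInputStream in = fs.open(path);
        try {
            in.seek(128);                  // jump to absolute byte offset 128
            byte[] buf = new byte[64];
            in.readFully(buf);             // read 64 bytes starting at that offset
            System.out.println("position now: " + in.getPos()); // prints 192
        } finally {
            in.close();
        }
    }
}

Every example below is a variant of this pattern: compute an absolute offset (a split start, a fixed-width record index, or a stored index entry) and seek to it before reading.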
From source file:org.broadinstitute.sting.gatk.hadoop.SplittingBAMIndexer.java
License:Open Source License
public void index(final Path file, Configuration cfg) throws IOException {
    FileSystem fs = file.getFileSystem(cfg);
    final FSDataInputStream fin = fs.open(file);
    fin.seek(0); // rewind to the beginning of the BAM file before wrapping

    final BlockCompressedInputStream in = new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(fin, fs.getFileStatus(file).getLen(), file));

    String pstr = file.toString();
    String nstr = pstr.replace(file.getName(), "_" + file.getName() + ".splitting-bai");
    FSDataOutputStream fout = fs.create(new Path(nstr), true);
    final OutputStream out = new BufferedOutputStream(fout);

    final LongBuffer lb = byteBuffer.order(ByteOrder.BIG_ENDIAN).asLongBuffer();

    skipToAlignmentList(in);

    // Always write the first one to make sure it's not skipped
    lb.put(0, in.getFilePointer());
    out.write(byteBuffer.array());

    long prevPrint = in.getFilePointer() >> 16;

    for (int i = 0;;) {
        final PtrSkipPair pair = readAlignment(in);
        if (pair == null)
            break;
        if (++i == granularity) {
            i = 0;
            lb.put(0, pair.ptr);
            out.write(byteBuffer.array());

            // progress indicator: print a dash every PRINT_EVERY compressed bytes
            final long filePos = pair.ptr >> 16;
            if (filePos - prevPrint >= PRINT_EVERY) {
                System.out.print("-");
                prevPrint = filePos;
            }
        }
        fullySkip(in, pair.skip);
    }

    // final entry: the file length expressed as a virtual offset (block offset only)
    lb.put(0, fs.getFileStatus(file).getLen() << 16);
    out.write(byteBuffer.array());
    out.close();
    in.close();
}
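A note on this example: WrapSeekable (from Hadoop-BAM) adapts the FSDataInputStream to the SeekableStream interface that SAMtools' BlockCompressedInputStream expects, and seek(0) rewinds the raw stream before it is wrapped. The pointers written to the index are BGZF virtual file offsets, whose low 16 bits hold the position within the uncompressed block, which is why the code shifts by 16 bits when comparing compressed positions and when writing the final file-length entry.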
From source file:org.cloudgraph.mapreduce.GraphXmlRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    // This InputSplit is a FileSplit
    FileSplit split = (FileSplit) inputSplit;
    this.context = context;
    this.configuration = context.getConfiguration();
    this.getCounter = Counters.retrieveGetCounterWithStringsParams(context);

    this.rootNamespaceUri = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_URI);
    this.rootNamespacePrefix = configuration.get(GraphXmlInputFormat.ROOT_ELEM_NAMESPACE_PREFIX, "ns1");

    this.unmarshalOptions = new DefaultOptions(this.rootNamespaceUri);
    this.unmarshalOptions.setRootNamespacePrefix(this.rootNamespacePrefix);
    this.unmarshalOptions.setValidate(false);
    this.unmarshalOptions.setFailOnValidationError(false);
    this.unmarshaler = new StreamUnmarshaller(this.unmarshalOptions, null);

    // Max allowed bytes for a single record
    this.maxLineLength = configuration.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);

    // Split "S" is responsible for all records
    // between the "start" and "end" positions
    start = split.getStart();
    end = start + split.getLength();

    // Retrieve the file containing split "S"
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(this.configuration);
    FSDataInputStream fileIn = fs.open(split.getPath());

    // If split "S" starts at byte 0, the first line will be processed.
    // If split "S" does not start at byte 0, the first line has already been
    // processed by "S-1" and therefore must be silently ignored.
    boolean skipFirstLine = false;
    if (start != 0) {
        skipFirstLine = true;
        // Set the file pointer at the "start - 1" position.
        // This makes sure we won't miss any line: "start" could
        // be located exactly on an EOL.
        --start;
        fileIn.seek(start);
    }
    in = new LineReader(fileIn, this.configuration);

    // If the first line needs to be skipped, read it
    // and store its content in a throwaway Text
    if (skipFirstLine) {
        Text dummy = new Text();
        // Advance "start" by the length of the skipped line
        start += in.readLine(dummy, 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    // Position is the actual start
    this.pos = start;
}
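The seek to start - 1 is the classic split-alignment trick for line-oriented input: by backing up one byte before seeking, a split boundary that falls exactly on a newline is still treated as ending a partial line, which is then discarded, so every line is consumed by exactly one of the adjacent splits.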
From source file:org.commoncrawl.service.queryserver.query.DomainURLListQuery.java
License:Open Source License
private static void readPaginatedResults(final DatabaseIndexV2.MasterDatabaseIndex masterIndex, long domainId,
        FSDataInputStream inputStream, long length, String sortByField, int sortOrder, int pageNumber,
        int pageSize, QueryResult<URLFPV2, CrawlDatumAndMetadata> resultOut) throws IOException {

    // if descending sort order ...
    // take pageNumber * pageSize as starting point
    long offset = 0;
    long startPos = 0;
    long endPos = 0;

    // calculate total record count ...
    int totalRecordCount = (int) (length / FP_RECORD_SIZE);

    resultOut.getResults().clear();
    resultOut.setPageNumber(pageNumber);
    resultOut.setTotalRecordCount(totalRecordCount);

    // flip pr due to bug in how we sort pr
    if (sortByField.equals(SORT_BY_PR)) {
        if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING)
            sortOrder = ClientQueryInfo.SortOrder.DESCENDING;
        else
            sortOrder = ClientQueryInfo.SortOrder.ASCENDING;
    }

    if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
        startPos = pageNumber * pageSize;
        endPos = Math.min(startPos + pageSize, totalRecordCount);
        offset = pageNumber * pageSize;
    } else {
        startPos = totalRecordCount - ((pageNumber + 1) * pageSize);
        endPos = startPos + pageSize;
        startPos = Math.max(0, startPos);
        offset = totalRecordCount - ((pageNumber + 1) * pageSize);
    }

    if (startPos < totalRecordCount) {
        // each record is FP_RECORD_SIZE bytes, so the byte offset of the
        // first record on this page is a simple multiplication
        inputStream.seek(startPos * FP_RECORD_SIZE);

        for (long i = startPos; i < endPos; ++i) {
            URLFPV2 key = new URLFPV2();
            key.setDomainHash(domainId);
            key.setUrlHash(inputStream.readLong());

            // ok, time to find this item in the master index ...
            CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();
            MetadataOut metadataOut = masterIndex.queryMetadataAndURLGivenFP(key);

            if (metadataOut == null) {
                LOG.error("Failed to Retrieve URL and Metadata for Domain:" + domainId + " FP:"
                        + key.getUrlHash());
                metadataObject.setUrl("NULL-DH(" + key.getDomainHash() + ")-FP(" + key.getUrlHash() + ")");
            } else {
                metadataObject.setUrl(metadataOut.url.toString());
                metadataObject.setStatus(metadataOut.fetchStatus);
                if (metadataOut.lastFetchTime > 0) {
                    metadataObject.getMetadata().setLastFetchTimestamp(metadataOut.lastFetchTime);
                }
                metadataObject.getMetadata().setPageRank(metadataOut.pageRank);
            }

            if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
                resultOut.getResults().add(0,
                        new QueryResultRecord<URLFPV2, CrawlDatumAndMetadata>(key, metadataObject));
            } else {
                resultOut.getResults()
                        .add(new QueryResultRecord<URLFPV2, CrawlDatumAndMetadata>(key, metadataObject));
            }
        }
    }
}
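Because the flat file consists of fixed-width FP_RECORD_SIZE records, the byte offset of any page is simply startPos * FP_RECORD_SIZE; a single seek lands on the first record of the page, making pagination O(1) no matter how deep into the file the page lies.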
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
private static void readPaginatedInlinkingDomainInfo(final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
        FileSystem indexFileSystem, Path indexPath, Path detailPath, int sortOrder, int pageNumber,
        int pageSize, QueryResult<Writable, Writable> resultOut) throws IOException {

    // if descending sort order ...
    // take pageNumber * pageSize as starting point
    long offset = 0;
    long startPos = 0;
    long endPos = 0;

    FSDataInputStream indexStream = indexFileSystem.open(indexPath);

    try {
        // read in the total record count ...
        int totalRecordCount = indexStream.readInt();

        LOG.info("***RecordCount:" + totalRecordCount + " Allocating Buffer Of:" + (totalRecordCount * 4)
                + " bytes. FileLength:" + indexFileSystem.getFileStatus(indexPath).getLen());

        // read the index header data up front
        byte indexHeaderData[] = new byte[totalRecordCount * 4];
        indexStream.readFully(indexHeaderData);

        // mark the position where the detail records start
        long detailStartPos = indexStream.getPos();

        // initialize the index header reader stream
        DataInputBuffer indexHeaderStream = new DataInputBuffer();
        indexHeaderStream.reset(indexHeaderData, 0, indexHeaderData.length);

        resultOut.getResults().clear();
        resultOut.setPageNumber(pageNumber);
        resultOut.setTotalRecordCount(totalRecordCount);

        if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
            startPos = pageNumber * pageSize;
            endPos = Math.min(startPos + pageSize, totalRecordCount);
            offset = pageNumber * pageSize;
        } else {
            startPos = totalRecordCount - ((pageNumber + 1) * pageSize);
            endPos = startPos + pageSize;
            startPos = Math.max(0, startPos);
            offset = totalRecordCount - ((pageNumber + 1) * pageSize);
        }

        if (startPos < totalRecordCount) {
            // skip to this page's entries in the in-memory offset table
            indexHeaderStream.skip(startPos * 4);

            for (long i = startPos; i < endPos; ++i) {
                // read the data offset ...
                int domainDataPos = indexHeaderStream.readInt();
                // seek to it
                indexStream.seek(detailStartPos + domainDataPos);
                // read the detail data
                InlinkingDomainInfo domainInfo = new InlinkingDomainInfo();
                domainInfo.readFields(indexStream);
                // ok, extract the name
                String domainName = domainInfo.getDomainName();
                if (domainName.length() == 0) {
                    // TODO: NEED TO TRACK THIS DOWN
                    domainName = "<<OOPS-NULL>>";
                }
                Text key = new Text(domainName);
                domainInfo.setFieldClean(InlinkingDomainInfo.Field_DOMAINNAME);
                if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
                    resultOut.getResults().add(0, new QueryResultRecord<Writable, Writable>(key, domainInfo));
                } else {
                    resultOut.getResults().add(new QueryResultRecord<Writable, Writable>(key, domainInfo));
                }
            }
        }
    } finally {
        indexStream.close();
    }
}
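This index file has a two-part layout: a record count, a header of totalRecordCount 4-byte offsets, then variable-length detail records. The header is read fully into memory, and each page entry costs exactly one seek(detailStartPos + domainDataPos) into the detail region, so even variable-length records can be addressed randomly.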
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
private static void readPaginatedResults(final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
        FSDataInputStream inputStream, long length, int sortOrder, int pageNumber, int pageSize,
        QueryResult<Writable, Writable> resultOut) throws IOException {

    // if descending sort order ...
    // take pageNumber * pageSize as starting point
    long offset = 0;
    long startPos = 0;
    long endPos = 0;

    // calculate total record count ...
    int totalRecordCount = (int) (length / FP_RECORD_SIZE);

    resultOut.getResults().clear();
    resultOut.setPageNumber(pageNumber);
    resultOut.setTotalRecordCount(totalRecordCount);

    if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
        startPos = pageNumber * pageSize;
        endPos = Math.min(startPos + pageSize, totalRecordCount);
        offset = pageNumber * pageSize;
    } else {
        startPos = totalRecordCount - ((pageNumber + 1) * pageSize);
        endPos = startPos + pageSize;
        startPos = Math.max(0, startPos);
        offset = totalRecordCount - ((pageNumber + 1) * pageSize);
    }

    if (startPos < totalRecordCount) {
        inputStream.seek(startPos * FP_RECORD_SIZE);

        for (long i = startPos; i < endPos; ++i) {
            URLFPV2 key = new URLFPV2();
            key.setDomainHash(inputStream.readLong());
            key.setUrlHash(inputStream.readLong());

            // ok time to find this item in the master index ...
            CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();
            MetadataOut metadataOut = masterIndex.queryMetadataAndURLGivenFP(key);
            if (metadataOut == null) {
                LOG.error("Failed to Retrieve URL and Metadata for Domain:" + key.getDomainHash() + " FP:"
                        + key.getUrlHash());
                metadataObject.setUrl("NULL-DH(" + key.getDomainHash() + ")-FP(" + key.getUrlHash() + ")");
            } else {
                metadataObject.setUrl(metadataOut.url.toString());
                metadataObject.setStatus(metadataOut.fetchStatus);
                if (metadataOut.lastFetchTime > 0) {
                    metadataObject.getMetadata().setLastFetchTimestamp(metadataOut.lastFetchTime);
                }
                metadataObject.getMetadata().setPageRank(metadataOut.pageRank);
            }

            if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
                resultOut.getResults().add(0, new QueryResultRecord<Writable, Writable>(key, metadataObject));
            } else {
                resultOut.getResults().add(new QueryResultRecord<Writable, Writable>(key, metadataObject));
            }
        }
    }
}
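Same fixed-width pagination as the DomainURLListQuery example above; the difference is that here each FP_RECORD_SIZE record stores both the domain hash and the URL hash, so both longs are read sequentially after the initial page seek.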
From source file:org.deepak.joins.CustomRecordReader.java
License:Apache License
public CustomRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    curSplit = split;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        // a compressed stream cannot be seeked into, so one reader consumes the whole file
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
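The codec branch is the important design point: a stream wrapped in a non-splittable decompressor has no seekable byte offsets, so the reader decompresses from byte 0 and forces end = Long.MAX_VALUE, meaning one reader owns the entire file; seek(start - 1) is used only on the plain, uncompressed path.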
From source file:org.godhuli.rhipe.RXLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine((new RHText()).getText(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
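Identical split handling to the previous example; the only difference is that the skipped first line is read into the Text held by an RHText wrapper rather than a throwaway Text.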
From source file:org.hedera.util.SeekableInputStream.java
License:Apache License
public static SeekableInputStream getInstance(Path path, long start, long end, FileSystem fs,
        CompressionCodecFactory compressionCodecs) throws IOException {
    CompressionCodec codec = compressionCodecs.getCodec(path);
    FSDataInputStream din = fs.open(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // splittable codec: let the codec position itself within [start, end)
            SplittableCompressionCodec scodec = (SplittableCompressionCodec) codec;
            SplitCompressionInputStream cin = scodec.createInputStream(din, decompressor, start, end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            return new SeekableInputStream(cin);
        } else {
            // non-splittable compression input stream:
            // no seeking or offsetting is needed (or possible)
            assert start == 0;
            CompressionInputStream cin = codec.createInputStream(din, decompressor);
            return new SeekableInputStream(cin, din);
        }
    } else {
        // uncompressed input stream:
        // seek directly to the start of the split
        din.seek(start);
        return new SeekableInputStream(din);
    }
}
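This factory separates the three cases cleanly: a SplittableCompressionCodec (bzip2, for example) can position the compressed stream inside [start, end) itself, a non-splittable codec must begin at byte 0 (hence the assert), and only a raw uncompressed FSDataInputStream is seeked directly.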
From source file:org.huahinframework.core.lib.input.SimpleRecordReader.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }

    // skip first line and re-establish "start".
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }

    this.fileName = file.getName();
    this.fileLength = fs.getFileStatus(file).getLen();
    this.conf = context.getConfiguration();
    this.pos = start;
    this.separator = conf.get(SimpleJob.SEPARATOR, StringUtil.COMMA);
    this.regex = conf.getBoolean(SimpleJob.SEPARATOR_REGEX, false);
    init();
}
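The same seek-and-skip split handling once more, here followed by per-file state setup (file name, length, field separator, and separator-as-regex flag) for a delimited-text record reader.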
From source file:org.jclouds.examples.blobstore.hdfs.io.HdfsPayloadSlicer.java
License:Apache License
protected Payload doSlice(final FSDataInputStream inputStream, final long offset, final long length) {
    return new InputStreamSupplierPayload(new InputSupplier<InputStream>() {
        public InputStream getInput() throws IOException {
            if (offset > 0) {
                try {
                    // position the stream lazily, when the payload is consumed
                    inputStream.seek(offset);
                } catch (IOException e) {
                    Closeables.closeQuietly(inputStream);
                    throw e;
                }
            }
            return new LimitInputStream(inputStream, length);
        }
    });
}
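Here the seek is deferred: it runs inside the Guava InputSupplier only when the payload is first consumed, and the stream is closed if positioning fails before the exception propagates; LimitInputStream then caps the slice at length bytes, so the payload exposes exactly the [offset, offset + length) range of the file.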