List of usage examples for org.apache.hadoop.fs FSDataInputStream seek
@Override public void seek(long desired) throws IOException
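FSDataInputStream implements Seekable, so seek(long desired) moves the stream to an absolute byte offset; the next read continues from that position, and getPos() reports it. Before the examples from real projects below, here is a minimal sketch of the call itself (the path /tmp/data.bin and the offset 16 are hypothetical placeholders, not taken from the examples):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/data.bin"); // hypothetical file for illustration
        try (FSDataInputStream in = fs.open(path)) {
            in.seek(16);                                // reposition to absolute byte offset 16
            System.out.println("pos = " + in.getPos()); // prints 16
            int b = in.read();                          // reads the byte at offset 16
            System.out.println("byte = " + b);
        }
    }
}

Note that seek is absolute, not relative; to skip forward from the current position, combine getPos() with seek(), as several of the examples below do (e.g. is.seek(pos + compressedBlockSize + ...)).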
From source file:com.hadoop.compression.lzo.LzoIndex.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 *
 * @param fs File system that contains the file.
 * @param lzoFile the lzo file to index. For filename.lzo, the created index file will be
 *        filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
        throw new IOException("Could not find codec for file " + lzoFile
                + " - you may need to add the LZO codec to your io.compression.codecs "
                + "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
        is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // Solely for reading the header
        codec.createInputStream(is, decompressor);
        int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
        int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            // See LzopInputStream.getCompressedData
            boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
            int numChecksumsToSkip = isUncompressedBlock
                    ? numDecompressedChecksums
                    : numDecompressedChecksums + numCompressedChecksums;
            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
        }
        // If we're here, indexing was successful.
        indexingSucceeded = true;
    } finally {
        // Close any open streams.
        if (is != null) {
            is.close();
        }
        if (os != null) {
            os.close();
        }
        if (!indexingSucceeded) {
            // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
            fs.delete(tmpOutputFile, false);
        } else {
            // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
            fs.rename(tmpOutputFile, outputFile);
        }
    }
}
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 *
 * @param fs
 *          File system that contains the file.
 * @param lzoFile
 *          the lzo file to index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (codec == null) {
        // Guard against a missing LZO codec; without this the cast below throws an NPE.
        throw new IOException("Could not find codec for file " + lzoFile);
    }
    ((Configurable) codec).setConf(conf);

    InputStream lzoIs = null;
    FSDataOutputStream os = null;
    Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX);
    Path tmpOutputFile = outputFile.suffix(".tmp");

    try {
        FSDataInputStream is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // for reading the header
        lzoIs = codec.createInputStream(is, decompressor);

        int numChecksums = decompressor.getChecksumsCount();
        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }
            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }
            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksums));
        }
    } finally {
        if (lzoIs != null) {
            lzoIs.close();
        }
        if (os != null) {
            os.close();
        }
    }
    fs.rename(tmpOutputFile, outputFile);
}
From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java
License:Apache License
public void readSequentialDirect() throws Exception {
    System.out.println("reading sequential file in direct mode " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus status = fs.getFileStatus(path);
    FSDataInputStream instream = fs.open(path);
    ByteBuffer buf = ByteBuffer.allocateDirect(size);
    buf.clear();
    double sumbytes = 0;
    double ops = 0;
    System.out.println("file capacity " + status.getLen());
    System.out.println("read size " + size);
    System.out.println("operations " + loop);

    long start = System.currentTimeMillis();
    while (ops < loop) {
        buf.clear();
        double ret = (double) instream.read(buf);
        if (ret > 0) {
            sumbytes = sumbytes + ret;
            ops = ops + 1.0;
        } else {
            ops = ops + 1.0;
            if (instream.getPos() == 0) {
                break;
            } else {
                instream.seek(0);
            }
        }
    }
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start)) / 1000.0;
    double throughput = 0.0;
    double latency = 0.0;
    double sumbits = sumbytes * 8.0;
    if (executionTime > 0) {
        throughput = sumbits / executionTime / 1024.0 / 1024.0;
        latency = 1000000.0 * executionTime / ops;
    }
    System.out.println("execution time " + executionTime);
    System.out.println("ops " + ops);
    System.out.println("sumbytes " + sumbytes);
    System.out.println("throughput " + throughput);
    System.out.println("latency " + latency);
    System.out.println("closing stream");
    instream.close();
    fs.close();
}
From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java
License:Apache License
public void readSequentialHeap() throws Exception {
    System.out.println("reading sequential file in heap mode " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus status = fs.getFileStatus(path);
    FSDataInputStream instream = fs.open(path);
    byte[] buf = new byte[size];
    double sumbytes = 0;
    double ops = 0;
    System.out.println("file capacity " + status.getLen());
    System.out.println("read size " + size);
    System.out.println("operations " + loop);

    long start = System.currentTimeMillis();
    while (ops < loop) {
        double ret = (double) this.read(instream, buf);
        if (ret > 0) {
            sumbytes = sumbytes + ret;
            ops = ops + 1.0;
        } else {
            ops = ops + 1.0;
            if (instream.getPos() == 0) {
                break;
            } else {
                instream.seek(0);
            }
        }
    }
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start)) / 1000.0;
    double throughput = 0.0;
    double latency = 0.0;
    double sumbits = sumbytes * 8.0;
    if (executionTime > 0) {
        throughput = sumbits / executionTime / 1024.0 / 1024.0;
        latency = 1000000.0 * executionTime / ops;
    }
    System.out.println("execution time " + executionTime);
    System.out.println("ops " + ops);
    System.out.println("sumbytes " + sumbytes);
    System.out.println("throughput " + throughput);
    System.out.println("latency " + latency);
    System.out.println("closing stream");
    instream.close();
    fs.close();
}
From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java
License:Apache License
public void readRandomDirect() throws Exception {
    System.out.println("reading random file in direct mode " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus status = fs.getFileStatus(path);
    FSDataInputStream instream = fs.open(path);
    ByteBuffer buf = ByteBuffer.allocateDirect(size);
    buf.clear();
    double sumbytes = 0;
    double ops = 0;
    long _range = status.getLen() - ((long) buf.capacity());
    double range = (double) _range;
    Random random = new Random();
    System.out.println("file capacity " + status.getLen());
    System.out.println("read size " + size);
    System.out.println("operations " + loop);

    long start = System.currentTimeMillis();
    while (ops < loop) {
        buf.clear();
        double _offset = range * random.nextDouble();
        long offset = (long) _offset;
        instream.seek(offset);
        double ret = (double) instream.read(buf);
        if (ret > 0) {
            sumbytes = sumbytes + ret;
            ops = ops + 1.0;
        } else {
            break;
        }
    }
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start)) / 1000.0;
    double throughput = 0.0;
    double latency = 0.0;
    double sumbits = sumbytes * 8.0;
    if (executionTime > 0) {
        throughput = sumbits / executionTime / 1024.0 / 1024.0;
        latency = 1000000.0 * executionTime / ops;
    }
    System.out.println("execution time " + executionTime);
    System.out.println("ops " + ops);
    System.out.println("sumbytes " + sumbytes);
    System.out.println("throughput " + throughput);
    System.out.println("latency " + latency);
    System.out.println("closing stream");
    instream.close();
    fs.close();
}
From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java
License:Apache License
public void readRandomHeap() throws Exception {
    System.out.println("reading random file in heap mode " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus status = fs.getFileStatus(path);
    FSDataInputStream instream = fs.open(path);
    byte[] buf = new byte[size];
    double sumbytes = 0;
    double ops = 0;
    long _range = status.getLen() - ((long) buf.length);
    double range = (double) _range;
    Random random = new Random();
    System.out.println("file capacity " + status.getLen());
    System.out.println("read size " + size);
    System.out.println("operations " + loop);

    long start = System.currentTimeMillis();
    while (ops < loop) {
        double _offset = range * random.nextDouble();
        long offset = (long) _offset;
        instream.seek(offset);
        double ret = (double) this.read(instream, buf);
        if (ret > 0) {
            sumbytes = sumbytes + ret;
            ops = ops + 1.0;
        } else {
            break;
        }
    }
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start)) / 1000.0;
    double throughput = 0.0;
    double latency = 0.0;
    double sumbits = sumbytes * 8.0;
    if (executionTime > 0) {
        throughput = sumbits / executionTime / 1024.0 / 1024.0;
        latency = 1000000.0 * executionTime / ops;
    }
    System.out.println("execution time " + executionTime);
    System.out.println("ops " + ops);
    System.out.println("sumbytes " + sumbytes);
    System.out.println("throughput " + throughput);
    System.out.println("latency " + latency);
    System.out.println("closing stream");
    instream.close();
    fs.close();
}
From source file:com.intel.hibench.datagen.streaming.util.SourceFileReader.java
License:Apache License
static public BufferedReader getReader(Configuration dfsConf, String path, long offset) {
    BufferedReader reader = null;
    try {
        Path pt = new Path(path);
        FileSystem fs = FileSystem.get(dfsConf);
        InputStreamReader isr;
        if (fs.isDirectory(pt)) {
            // the given path is a directory
            isr = new InputStreamReader(openMultipleParts(fs, pt, offset));
        } else {
            // the given path is a single file
            FSDataInputStream inputStream = fs.open(pt);
            if (offset > 0) {
                inputStream.seek(offset);
            }
            isr = new InputStreamReader(inputStream);
        }
        reader = new BufferedReader(isr);
    } catch (IOException e) {
        System.err.println("Failed to get reader from path: " + path);
        e.printStackTrace();
    }
    return reader;
}
From source file:com.intel.hibench.datagen.streaming.util.SourceFileReader.java
License:Apache License
static private InputStream openMultipleParts(FileSystem fs, Path pt, long offset) throws IOException {
    System.out.println("opening all parts in path: " + pt + ", from offset: " + offset);
    // list all files in the given path
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
        Path path = rit.next().getPath();
        // only read those files whose names start with "part-"
        if (path.getName().startsWith("part-")) {
            long fileSize = fs.getFileStatus(path).getLen();
            if (offset < fileSize) {
                FSDataInputStream inputStream = fs.open(path);
                if (offset > 0) {
                    inputStream.seek(offset);
                }
                fileHandleList.add(inputStream);
            }
            offset -= fileSize;
        }
    }
    if (!fileHandleList.isEmpty()) {
        return new SequenceInputStream(fileHandleList.elements());
    } else {
        System.err.println("Error, no source file loaded. run genSeedDataset.sh first!");
        return null;
    }
}
From source file:com.intel.hibench.streambench.FileDataGenNew.java
License:Apache License
public BufferedReader loadDataFromFile(String filepath, long offset) {
    try {
        Path pt = new Path(filepath);
        FileSystem fs = FileSystem.get(fsConf);
        InputStreamReader isr;
        if (fs.isDirectory(pt)) {
            // multiple parts
            isr = new InputStreamReader(OpenMultiplePartsWithOffset(fs, pt, offset));
        } else {
            // single file
            FSDataInputStream fileHandler = fs.open(pt);
            if (offset > 0) {
                fileHandler.seek(offset);
            }
            isr = new InputStreamReader(fileHandler);
        }
        BufferedReader reader = new BufferedReader(isr);
        if (offset > 0) {
            // a seek may land mid-line, so discard the (likely partial) first line
            reader.readLine();
        }
        return reader;
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
From source file:com.intel.hibench.streambench.FileDataGenNew.java
License:Apache License
private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset) throws IOException {
    System.out.println("Opening files, path:" + pt + " offset:" + offset);
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
        Path path = rit.next().getPath();
        String filename = path.toString().substring(path.getParent().toString().length(),
                path.toString().length());
        if (filename.startsWith("/part-")) {
            long filesize = fs.getFileStatus(path).getLen();
            if (offset < filesize) {
                FSDataInputStream handle = fs.open(path);
                if (offset > 0) {
                    handle.seek(offset);
                }
                fileHandleList.add(handle);
            }
            offset -= filesize;
        }
    }
    if (fileHandleList.size() == 1) {
        return fileHandleList.get(0);
    } else if (fileHandleList.size() > 1) {
        Enumeration<FSDataInputStream> enu = fileHandleList.elements();
        return new SequenceInputStream(enu);
    } else {
        System.err.println("Error, no source file loaded. run genSeedDataset.sh first!");
        return null;
    }
}