Example usage for org.apache.hadoop.fs FSDataInputStream seek

List of usage examples for org.apache.hadoop.fs FSDataInputStream seek

Introduction

On this page you can find example usage for org.apache.hadoop.fs FSDataInputStream seek.

Prototype

@Override
public void seek(long desired) throws IOException 

Document

Seek to the given offset.
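
Before the full examples below, here is a minimal, self-contained sketch of the typical pattern: open a file, seek to an absolute byte offset, and read from that position. The path /tmp/example.txt, the default Configuration, and the class name SeekExample are assumptions for illustration only.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.txt"); // hypothetical path

        FSDataInputStream in = null;
        try {
            in = fs.open(path);
            // Move to absolute byte offset 128; the offset must lie within the file.
            in.seek(128);
            byte[] buf = new byte[64];
            // The read starts at the new position; getPos() then reflects the position after the read.
            int bytesRead = in.read(buf);
            System.out.println("read " + bytesRead + " bytes, stream now at " + in.getPos());
        } finally {
            if (in != null) {
                in.close();
            }
        }
    }
}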

Usage

From source file:com.hadoop.compression.lzo.LzoIndex.java

License:Open Source License

/**
 * Index an lzo file so that the input format can split it into separate map
 * jobs.
 *
 * @param fs File system that contains the file.
 * @param lzoFile the lzo file to index.  For filename.lzo, the created index file will be
 * filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
        throw new IOException("Could not find codec for file " + lzoFile
                + " - you may need to add the LZO codec to your io.compression.codecs "
                + "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
        is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // Solely for reading the header
        codec.createInputStream(is, decompressor);
        int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
        int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            // See LzopInputStream.getCompressedData
            boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
            int numChecksumsToSkip = isUncompressedBlock ? numDecompressedChecksums
                    : numDecompressedChecksums + numCompressedChecksums;
            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
        }
        // If we're here, indexing was successful.
        indexingSucceeded = true;
    } finally {
        // Close any open streams.
        if (is != null) {
            is.close();
        }

        if (os != null) {
            os.close();
        }

        if (!indexingSucceeded) {
            // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
            fs.delete(tmpOutputFile, false);
        } else {
            // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
            fs.rename(tmpOutputFile, outputFile);
        }
    }
}

From source file:com.hadoop.mapreduce.LzoTextInputFormat.java

License:Open Source License

/**
 * Index an lzo file so that the input format can split it into separate map
 * jobs.
 * 
 * @param fs
 *          File system that contains the file.
 * @param lzoFile
 *          the lzo file to index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
    CompressionCodec codec = factory.getCodec(lzoFile);
    ((Configurable) codec).setConf(conf);

    InputStream lzoIs = null;
    FSDataOutputStream os = null;
    Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX);
    Path tmpOutputFile = outputFile.suffix(".tmp");

    try {
        FSDataInputStream is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // for reading the header
        lzoIs = codec.createInputStream(is, decompressor);

        int numChecksums = decompressor.getChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksums));
        }
    } finally {
        if (lzoIs != null) {
            lzoIs.close();
        }

        if (os != null) {
            os.close();
        }
    }

    fs.rename(tmpOutputFile, outputFile);
}

From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java

License:Apache License

public void readSequentialDirect() throws Exception {
    System.out.println("reading sequential file in direct mode " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus status = fs.getFileStatus(path);
    FSDataInputStream instream = fs.open(path);
    ByteBuffer buf = ByteBuffer.allocateDirect(size);
    buf.clear();
    double sumbytes = 0;
    double ops = 0;
    System.out.println("file capacity " + status.getLen());
    System.out.println("read size " + size);
    System.out.println("operations " + loop);

    long start = System.currentTimeMillis();
    while (ops < loop) {
        buf.clear();
        double ret = (double) instream.read(buf);
        if (ret > 0) {
            sumbytes = sumbytes + ret;
            ops = ops + 1.0;
        } else {
            ops = ops + 1.0;
            if (instream.getPos() == 0) {
                break;
            } else {
                instream.seek(0);
            }
        }
    }
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start)) / 1000.0;
    double throughput = 0.0;
    double latency = 0.0;
    double sumbits = sumbytes * 8.0;
    if (executionTime > 0) {
        throughput = sumbits / executionTime / 1024.0 / 1024.0;
        latency = 1000000.0 * executionTime / ops;
    }
    System.out.println("execution time " + executionTime);
    System.out.println("ops " + ops);
    System.out.println("sumbytes " + sumbytes);
    System.out.println("throughput " + throughput);
    System.out.println("latency " + latency);
    System.out.println("closing stream");
    instream.close();
    fs.close();
}

From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java

License:Apache License

public void readSequentialHeap() throws Exception {
    System.out.println("reading sequential file in heap mode " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus status = fs.getFileStatus(path);
    FSDataInputStream instream = fs.open(path);
    byte[] buf = new byte[size];
    double sumbytes = 0;
    double ops = 0;
    System.out.println("file capacity " + status.getLen());
    System.out.println("read size " + size);
    System.out.println("operations " + loop);

    long start = System.currentTimeMillis();
    while (ops < loop) {
        double ret = (double) this.read(instream, buf);
        if (ret > 0) {
            sumbytes = sumbytes + ret;
            ops = ops + 1.0;
        } else {
            ops = ops + 1.0;
            if (instream.getPos() == 0) {
                break;
            } else {
                instream.seek(0);
            }
        }
    }
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start)) / 1000.0;
    double throughput = 0.0;
    double latency = 0.0;
    double sumbits = sumbytes * 8.0;
    if (executionTime > 0) {
        throughput = sumbits / executionTime / 1024.0 / 1024.0;
        latency = 1000000.0 * executionTime / ops;
    }
    System.out.println("execution time " + executionTime);
    System.out.println("ops " + ops);
    System.out.println("sumbytes " + sumbytes);
    System.out.println("throughput " + throughput);
    System.out.println("latency " + latency);
    System.out.println("closing stream");
    instream.close();
    fs.close();
}

From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java

License:Apache License

public void readRandomDirect() throws Exception {
    System.out.println("reading random file in direct mode " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus status = fs.getFileStatus(path);
    FSDataInputStream instream = fs.open(path);
    ByteBuffer buf = ByteBuffer.allocateDirect(size);
    buf.clear();
    double sumbytes = 0;
    double ops = 0;
    long _range = status.getLen() - ((long) buf.capacity());
    double range = (double) _range;
    Random random = new Random();

    System.out.println("file capacity " + status.getLen());
    System.out.println("read size " + size);
    System.out.println("operations " + loop);
    long start = System.currentTimeMillis();
    while (ops < loop) {
        buf.clear();
        double _offset = range * random.nextDouble();
        long offset = (long) _offset;
        instream.seek(offset);
        double ret = (double) instream.read(buf);
        if (ret > 0) {
            sumbytes = sumbytes + ret;
            ops = ops + 1.0;
        } else {
            break;
        }
    }
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start)) / 1000.0;
    double throughput = 0.0;
    double latency = 0.0;
    double sumbits = sumbytes * 8.0;
    if (executionTime > 0) {
        throughput = sumbits / executionTime / 1024.0 / 1024.0;
        latency = 1000000.0 * executionTime / ops;
    }

    System.out.println("execution time " + executionTime);
    System.out.println("ops " + ops);
    System.out.println("sumbytes " + sumbytes);
    System.out.println("throughput " + throughput);
    System.out.println("latency " + latency);
    System.out.println("closing stream");
    instream.close();
    fs.close();
}

From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java

License:Apache License

public void readRandomHeap() throws Exception {
    System.out.println("reading random file in heap mode " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    FileStatus status = fs.getFileStatus(path);
    FSDataInputStream instream = fs.open(path);
    byte[] buf = new byte[size];
    double sumbytes = 0;
    double ops = 0;
    long _range = status.getLen() - ((long) buf.length);
    double range = (double) _range;
    Random random = new Random();

    System.out.println("file capacity " + status.getLen());
    System.out.println("read size " + size);
    System.out.println("operations " + loop);
    long start = System.currentTimeMillis();
    while (ops < loop) {
        double _offset = range * random.nextDouble();
        long offset = (long) _offset;
        instream.seek(offset);
        double ret = (double) this.read(instream, buf);
        if (ret > 0) {
            sumbytes = sumbytes + ret;
            ops = ops + 1.0;
        } else {
            break;
        }
    }
    long end = System.currentTimeMillis();
    double executionTime = ((double) (end - start)) / 1000.0;
    double throughput = 0.0;
    double latency = 0.0;
    double sumbits = sumbytes * 8.0;
    if (executionTime > 0) {
        throughput = sumbits / executionTime / 1024.0 / 1024.0;
        latency = 1000000.0 * executionTime / ops;
    }

    System.out.println("execution time " + executionTime);
    System.out.println("ops " + ops);
    System.out.println("sumbytes " + sumbytes);
    System.out.println("throughput " + throughput);
    System.out.println("latency " + latency);
    System.out.println("closing stream");
    instream.close();
    fs.close();
}

From source file:com.intel.hibench.datagen.streaming.util.SourceFileReader.java

License:Apache License

static public BufferedReader getReader(Configuration dfsConf, String path, long offset) {
    BufferedReader reader = null;
    try {
        Path pt = new Path(path);
        FileSystem fs = FileSystem.get(dfsConf);
        InputStreamReader isr;
        if (fs.isDirectory(pt)) {
            // the given path is a directory
            isr = new InputStreamReader(openMultipleParts(fs, pt, offset));
        } else {
            // the given path is a single file
            FSDataInputStream inputStream = fs.open(pt);
            if (offset > 0) {
                inputStream.seek(offset);
            }
            isr = new InputStreamReader(inputStream);
        }

        reader = new BufferedReader(isr);
    } catch (IOException e) {
        System.err.println("Fail to get reader from path: " + path);
        e.printStackTrace();
    }
    return reader;
}

From source file:com.intel.hibench.datagen.streaming.util.SourceFileReader.java

License:Apache License

static private InputStream openMultipleParts(FileSystem fs, Path pt, long offset) throws IOException {

    System.out.println("opening all parts in path: " + pt + ", from offset: " + offset);
    // list all files in given path
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
        Path path = rit.next().getPath();

        // Only read files whose names start with "part-"
        if (path.getName().startsWith("part-")) {
            long fileSize = fs.getFileStatus(path).getLen();
            if (offset < fileSize) {
                FSDataInputStream inputStream = fs.open(path);
                if (offset > 0) {
                    inputStream.seek(offset);
                }
                fileHandleList.add(inputStream);
            }
            offset -= fileSize;
        }
    }

    if (!fileHandleList.isEmpty()) {
        return new SequenceInputStream(fileHandleList.elements());
    } else {
        System.err.println("Error, no source file loaded. run genSeedDataset.sh first!");
        return null;
    }

}

From source file:com.intel.hibench.streambench.FileDataGenNew.java

License:Apache License

public BufferedReader loadDataFromFile(String filepath, long offset) {
    try {
        Path pt = new Path(filepath);
        FileSystem fs = FileSystem.get(fsConf);
        InputStreamReader isr;
        if (fs.isDirectory(pt)) { // multiple parts
            isr = new InputStreamReader(OpenMultiplePartsWithOffset(fs, pt, offset));
        } else { // single file
            FSDataInputStream fileHandler = fs.open(pt);
            if (offset > 0)
                fileHandler.seek(offset);
            isr = new InputStreamReader(fileHandler);
        }

        BufferedReader reader = new BufferedReader(isr);
        if (offset > 0)
            reader.readLine(); // skip first line in case of seek
        return reader;
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}

From source file:com.intel.hibench.streambench.FileDataGenNew.java

License:Apache License

private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset) throws IOException {
    System.out.println("Opening files, path:" + pt + " offset:" + offset);
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
        Path path = rit.next().getPath();
        String filename = path.toString().substring(path.getParent().toString().length(),
                path.toString().length());

        if (filename.startsWith("/part-")) {
            long filesize = fs.getFileStatus(path).getLen();
            if (offset < filesize) {
                FSDataInputStream handle = fs.open(path);
                if (offset > 0) {
                    handle.seek(offset);
                }
                fileHandleList.add(handle);
            }
            offset -= filesize;
        }
    }
    if (fileHandleList.size() == 1)
        return fileHandleList.get(0);
    else if (fileHandleList.size() > 1) {
        Enumeration<FSDataInputStream> enu = fileHandleList.elements();
        return new SequenceInputStream(enu);
    } else {
        System.err.println("Error, no source file loaded. run genSeedDataset.sh first!");
        return null;
    }
}