Example usage for org.apache.hadoop.fs FileSystem open

List of usage examples for org.apache.hadoop.fs FileSystem open

Introduction

On this page you can find usage examples for org.apache.hadoop.fs FileSystem open.

Prototype

public FSDataInputStream open(PathHandle fd) throws IOException 

Document

Open an FSDataInputStream matching the PathHandle instance.
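
All of the usage examples below call the Path-based overload, open(Path f), rather than the PathHandle variant shown above. As a quick orientation, here is a minimal, self-contained sketch of that call; the file path is an illustrative assumption and is not taken from any of the sources listed under Usage.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical path, used only for illustration.
        Path file = new Path("/tmp/example.txt");
        try (FSDataInputStream in = fs.open(file)) {
            // Copy the file contents to stdout; 'false' leaves both streams open
            // so the try-with-resources block can close the input stream.
            IOUtils.copyBytes(in, System.out, conf, false);
        }
    }
}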

Usage

From source file: com.davidgildeh.hadoop.utils.FileUtils.java

License: Apache License

/**
 * Merges all the input files in a directory into a single file at the
 * specified output path.
 * 
 * @param inputPath         The input directory containing all the input files. E.g. /input/dir/on/hdfs/
 * @param outputPath        The output path to output the file. E.g. /output/dir/on/hdfs/filename
 * @throws IOException
 */
public static void mergeFiles(String inputPath, String outputPath) throws IOException {

    Path inputDir = new Path(inputPath);
    Path outputFile = new Path(outputPath);
    FileSystem fileSystem = getFileSystem(outputFile);
    checkFileExists(fileSystem, inputDir);

    // Check the input path is a directory
    if (!fileSystem.getFileStatus(inputDir).isDir()) {
        LOG.error("Path '" + inputDir.toString() + "' is not a directory.");
        throw new IOException("Path '" + inputDir.toString() + "' is not a directory.");
    }

    // Create Output File
    OutputStream out = fileSystem.create(outputFile);

    try {

        FileStatus contents[] = fileSystem.listStatus(inputDir);

        // Loop through all files in directory and merge them into one file
        for (int i = 0; i < contents.length; i++) {

            if (!contents[i].isDir()) {

                InputStream in = fileSystem.open(contents[i].getPath());
                try {
                    IOUtils.copyBytes(in, out, fileSystem.getConf(), false);
                } finally {
                    in.close();
                }
            }
        }

    } finally {
        out.close();
        fileSystem.close();
        LOG.info("Merged input files from '" + inputPath + "' to '" + outputPath + "'");
    }
}
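
A hypothetical invocation of the helper above, using the placeholder paths from its Javadoc:

FileUtils.mergeFiles("/input/dir/on/hdfs/", "/output/dir/on/hdfs/filename");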

From source file: com.dianping.cat.hadoop.hdfs.MessageBlockReader.java

License: Open Source License

public MessageBlockReader(FileSystem fs, Path basePath, String dataFile) throws IOException {
    m_indexFile = fs.open(new Path(basePath, dataFile + ".idx"));
    m_dataFile = fs.open(new Path(basePath, dataFile));
}

From source file: com.dianping.cat.hadoop.hdfs.MessageBlockReader.java

License: Open Source License

public MessageBlockReader(FileSystem fs, String dataFile) throws IOException {
    m_indexFile = fs.open(new Path(dataFile + ".idx"));
    m_dataFile = fs.open(new Path(dataFile));
}

From source file: com.dinglicom.clouder.mapreduce.input.LineRecordReader.java

License: Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println("-------------------length:" + split.getLength() + "\tposition:" + split.getStart());
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    key = new Text(FileToCDRType.getTypeByPath(file.getName()));
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file: com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java

License: Open Source License

public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }
    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);
    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }

    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);
        LOG.warn("fileIn position: " + fileIn.getPos());
        LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));

        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    pos = start;
}

From source file: com.epam.catgenome.manager.bam.BamHelper.java

License: Open Source License

private SamInputResource getHDFSIndex(SamInputResource samInputResource, BiologicalDataItem indexFile)
        throws IOException {
    URI uriIndex = URI.create(indexFile.getPath());
    Configuration conf = new Configuration();
    FileSystem fileBam = FileSystem.get(uriIndex, conf);
    FSDataInputStream indexStream = fileBam.open(new Path(uriIndex));
    return samInputResource.index(new HdfsSeekableInputStream(indexStream));
}

From source file: com.epam.catgenome.manager.bam.BamHelper.java

License: Open Source License

@NotNull
private SamInputResource getHDFSSamInputResource(BamFile bamFile) throws IOException {
    final URI uriBam = URI.create(bamFile.getPath());
    final Configuration conf = new Configuration();
    final FileSystem fileBam = FileSystem.get(uriBam, conf);
    final FSDataInputStream inBam = fileBam.open(new Path(uriBam));
    return SamInputResource.of(new HdfsSeekableInputStream(inBam));
}

From source file: com.ery.hadoop.mrddx.file.LineRecordReader.java

License: Apache License

void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString() + " fileEncodeing="
            + fileEncodeing + " " + split.getStart() + ":" + split.getLength());
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // .tar.gz input: wrap the split-compressed stream in a TarInputStream
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file: com.ery.hadoop.mrddx.hFile.LineRecordReader.java

License: Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // .tar.gz input: wrap the decompressed stream in a TarInputStream
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}

From source file: com.examples.ch02.HdfsReader_Ex_2.java

public int run(String[] args) throws Exception {
    Path inputPath = new Path(args[0]);
    String localOutputPath = args[1];
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    InputStream is = fs.open(inputPath);
    OutputStream os = new BufferedOutputStream(new FileOutputStream(localOutputPath));
    IOUtils.copyBytes(is, os, conf);
    return 0;
}
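
The run method above follows the Hadoop Tool contract (note the getConf() call), so it is normally launched through ToolRunner. Below is a minimal driver sketch, assuming HdfsReader_Ex_2 extends Configured and implements Tool; that wiring is not shown in the excerpt.

// Hypothetical driver; assumes the enclosing class extends Configured and implements Tool.
public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new Configuration(), new HdfsReader_Ex_2(), args);
    System.exit(exitCode);
}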