List of usage examples for org.apache.hadoop.fs Path getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
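Before the project examples below, here is a minimal, self-contained sketch of the typical pattern: resolve the FileSystem that owns a Path from a Configuration, then perform I/O through it. The path used here is only an illustrative placeholder, not taken from any of the listed sources.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Illustrative placeholder path; getFileSystem() picks the FileSystem
        // (local, HDFS, ...) from the path's scheme and the configuration.
        Path path = new Path("/tmp/example/data.txt");
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            FileStatus status = fs.getFileStatus(path);
            System.out.println(path + " has length " + status.getLen());
        }
    }
}

The examples that follow show the same call inside Mahout drivers, MapReduce record readers, ORC input/output formats, and Hive I/O utilities.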
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static int getNumTerms(Configuration conf, Path dictionaryPath) throws IOException {
    FileSystem fs = dictionaryPath.getFileSystem(conf);
    Text key = new Text();
    IntWritable value = new IntWritable();
    int maxTermId = -1;
    for (FileStatus stat : fs.globStatus(dictionaryPath)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, stat.getPath(), conf);
        while (reader.next(key, value)) {
            maxTermId = Math.max(maxTermId, value.get());
        }
    }
    return maxTermId + 1;
}
From source file:com.elex.dmp.lda.InMemoryCollapsedVariationalBayes0.java
License:Apache License
private static Matrix loadVectors(String vectorPathString, Configuration conf) throws IOException {
    Path vectorPath = new Path(vectorPathString);
    FileSystem fs = vectorPath.getFileSystem(conf);
    List<Path> subPaths = Lists.newArrayList();
    if (fs.isFile(vectorPath)) {
        subPaths.add(vectorPath);
    } else {
        for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
            subPaths.add(fileStatus.getPath());
        }
    }
    List<Vector> vectorList = Lists.newArrayList();
    for (Path subPath : subPaths) {
        for (Pair<IntWritable, VectorWritable> record : new SequenceFileIterable<IntWritable, VectorWritable>(
                subPath, true, conf)) {
            vectorList.add(record.getSecond().get());
        }
    }
    int numRows = vectorList.size();
    int numCols = vectorList.get(0).size();
    return new SparseRowMatrix(numRows, numCols, vectorList.toArray(new Vector[vectorList.size()]), true,
            vectorList.get(0).isSequentialAccess());
}
From source file:com.ery.hadoop.mrddx.file.LineRecordReader.java
License:Apache License
void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString()
            + " fileEncodeing=" + fileEncodeing + " " + split.getStart() + ":" + split.getLength());
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // .tar.gz files are additionally wrapped in a TarInputStream
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.ery.hadoop.mrddx.hFile.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // .tar.gz files are additionally wrapped in a TarInputStream
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.facebook.hive.orc.FileDump.java
License:Open Source License
private static void processFile(String filename, Configuration conf) throws IOException {
    final Path path = new Path(filename);
    ReaderWriterProfiler.setProfilerOptions(conf);
    System.out.println("Structure for " + filename);
    final Reader reader = OrcFile.createReader(path.getFileSystem(conf), path, conf);
    final RecordReaderImpl rows = (RecordReaderImpl) reader.rows(null);
    System.out.println("Rows: " + reader.getNumberOfRows());
    printCompressionInformation(reader);
    System.out.println("Raw data size: " + reader.getRawDataSize());
    System.out.println("Type: " + reader.getObjectInspector().getTypeName());
    printColumnStatistics(reader);
    printStripeInformation(reader, rows);
}
From source file:com.facebook.hive.orc.OrcInputFormat.java
License:Open Source License
@Override
public RecordReader<NullWritable, OrcLazyRow> getRecordReader(InputSplit inputSplit, JobConf conf,
        Reporter reporter) throws IOException {
    ReaderWriterProfiler.setProfilerOptions(conf);
    FileSplit fileSplit = (FileSplit) inputSplit;
    Path path = fileSplit.getPath();
    FileSystem fs = path.getFileSystem(conf);
    reporter.setStatus(fileSplit.toString());
    return new OrcRecordReader(OrcFile.createReader(fs, path, conf), conf, fileSplit.getStart(),
            fileSplit.getLength());
}
From source file:com.facebook.hive.orc.OrcOutputFormat.java
License:Open Source License
@Override
public RecordWriter<NullWritable, OrcSerdeRow> getRecordWriter(FileSystem fileSystem, JobConf conf,
        String name, Progressable reporter) throws IOException {
    ReaderWriterProfiler.setProfilerOptions(conf);
    // To be compatible with older file formats like Sequence and RC.
    // Only works if mapred.work.output.dir is set in the conf.
    Path workOutputPath = FileOutputFormat.getWorkOutputPath(conf);
    Path outputPath = workOutputPath == null ? new Path(name) : new Path(workOutputPath, name);
    if (fileSystem == null && workOutputPath != null) {
        fileSystem = workOutputPath.getFileSystem(conf);
    }
    return new OrcRecordWriter(fileSystem, outputPath, conf,
            OrcConf.ConfVars.HIVE_ORC_STRIPE_SIZE.defaultLongVal,
            OrcConf.ConfVars.HIVE_ORC_COMPRESSION.defaultVal,
            OrcConf.ConfVars.HIVE_ORC_COMPRESSION_BLOCK_SIZE.defaultIntVal,
            OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE.defaultIntVal);
}
From source file:com.facebook.hive.orc.OrcOutputFormat.java
License:Open Source License
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf conf, Path path,
        Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        Progressable reporter) throws IOException {
    ReaderWriterProfiler.setProfilerOptions(conf);
    String stripeSizeStr = tableProperties.getProperty(OrcFile.STRIPE_SIZE);
    long stripeSize;
    if (stripeSizeStr != null) {
        stripeSize = Long.valueOf(stripeSizeStr);
    } else {
        stripeSize = OrcConf.getLongVar(conf, OrcConf.ConfVars.HIVE_ORC_STRIPE_SIZE);
    }
    String compression = tableProperties.getProperty(OrcFile.COMPRESSION);
    if (compression == null) {
        compression = OrcConf.getVar(conf, OrcConf.ConfVars.HIVE_ORC_COMPRESSION);
    }
    String compressionSizeStr = tableProperties.getProperty(OrcFile.COMPRESSION_BLOCK_SIZE);
    int compressionSize;
    if (compressionSizeStr != null) {
        compressionSize = Integer.valueOf(compressionSizeStr);
    } else {
        compressionSize = OrcConf.getIntVar(conf, OrcConf.ConfVars.HIVE_ORC_COMPRESSION_BLOCK_SIZE);
    }
    String rowIndexStrideStr = tableProperties.getProperty(OrcFile.ROW_INDEX_STRIDE);
    int rowIndexStride;
    if (rowIndexStrideStr != null) {
        rowIndexStride = Integer.valueOf(rowIndexStrideStr);
    } else {
        rowIndexStride = OrcConf.getIntVar(conf, OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE);
    }
    String enableIndexesStr = tableProperties.getProperty(OrcFile.ENABLE_INDEXES);
    boolean enableIndexes;
    if (enableIndexesStr != null) {
        enableIndexes = Boolean.valueOf(enableIndexesStr);
    } else {
        enableIndexes = OrcConf.getBoolVar(conf, OrcConf.ConfVars.HIVE_ORC_CREATE_INDEX);
    }
    if (!enableIndexes) {
        rowIndexStride = 0;
    }
    return new OrcRecordWriter(path.getFileSystem(conf), path, conf, stripeSize, compression,
            compressionSize, rowIndexStride);
}
From source file:com.facebook.hiveio.common.HadoopUtils.java
License:Apache License
/**
 * Delete the output directory for this job.
 *
 * @param conf Configuration to use
 * @throws IOException I/O errors
 */
public static void deleteOutputDir(Configuration conf) throws IOException {
    Path outputPath = getOutputPath(conf);
    outputPath.getFileSystem(conf).delete(outputPath, true);
}
From source file:com.facebook.hiveio.common.HadoopUtils.java
License:Apache License
/**
 * Set the worker output directory.
 *
 * @param context Task context
 * @throws IOException I/O errors
 */
public static void setWorkOutputDir(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String outputPath = getOutputDir(conf);
    // We need to do this to get the task path and set it for the mapred
    // implementation, since it can't be done automatically because of the
    // mapreduce -> mapred abstraction.
    if (outputPath != null) {
        FileOutputCommitter foc = new FileOutputCommitter(getOutputPath(conf), context);
        Path path = foc.getWorkPath();
        FileSystem fs = path.getFileSystem(conf);
        fs.mkdirs(path);
        conf.set("mapred.work.output.dir", path.toString());
        LOG.info("Setting mapred.work.output.dir to {}", path.toString());
    }
}