Example usage for org.apache.hadoop.fs FileSystem getConf

Introduction

This page collects example usages of the getConf() method of org.apache.hadoop.fs.FileSystem.

Prototype

@Override
    public Configuration getConf()

Source Link

Usage

From source file:com.hadoop.compression.lzo.LzoIndex.java

License:Open Source License

/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs./*from   ww w. j ava  2 s .  c o m*/
 *
 * @param fs File system that contains the file.
 * @param lzoFile the lzo file to index.  For filename.lzo, the created index file will be
 * filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
        throw new IOException("Could not find codec for file " + lzoFile
                + " - you may need to add the LZO codec to your io.compression.codecs "
                + "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
        is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // Solely for reading the header
        codec.createInputStream(is, decompressor);
        int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
        int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            // See LzopInputStream.getCompressedData
            boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
            int numChecksumsToSkip = isUncompressedBlock ? numDecompressedChecksums
                    : numDecompressedChecksums + numCompressedChecksums;
            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
        }
        // If we're here, indexing was successful.
        indexingSucceeded = true;
    } finally {
        // Close any open streams.
        if (is != null) {
            is.close();
        }

        if (os != null) {
            os.close();
        }

        if (!indexingSucceeded) {
            // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
            fs.delete(tmpOutputFile, false);
        } else {
            // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
            fs.rename(tmpOutputFile, outputFile);
        }
    }
}

From source file:com.hadoop.mapreduce.LzoTextInputFormat.java

License:Open Source License

/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs./*from  w w w .  j a  va 2  s . c  o m*/
 * 
 * @param fs
 *          File system that contains the file.
 * @param lzoFile
 *          the lzo file to index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
    CompressionCodec codec = factory.getCodec(lzoFile);
    ((Configurable) codec).setConf(conf);

    InputStream lzoIs = null;
    FSDataOutputStream os = null;
    Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX);
    Path tmpOutputFile = outputFile.suffix(".tmp");

    try {
        FSDataInputStream is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // for reading the header
        lzoIs = codec.createInputStream(is, decompressor);

        int numChecksums = decompressor.getChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksums));
        }
    } finally {
        if (lzoIs != null) {
            lzoIs.close();
        }

        if (os != null) {
            os.close();
        }
    }

    fs.rename(tmpOutputFile, outputFile);
}

From source file:com.hdfs.concat.crush.integration.CrushMapReduceTest.java

License:Apache License

/**
 * Copies data from the given input stream to an HDFS file at the given path. This method will close the input
 * stream, including when the destination file cannot be created (it is created with overwrite disabled).
 *
 * @param resource the stream to copy from; always closed by this method.
 * @param hdfsDestFileName path of the file to create.
 * @throws IOException if the destination cannot be created or the copy fails.
 */
protected final void copyStreamToHdfs(InputStream resource, String hdfsDestFileName) throws IOException {
    // Until copyBytes(..., true) takes ownership of both streams, closing the input is our job.
    boolean handedOff = false;
    try {
        FileSystem fs = getFileSystem();

        FSDataOutputStream os = fs.create(new Path(hdfsDestFileName), false);

        handedOff = true;
        IOUtils.copyBytes(resource, os, fs.getConf(), true);
    } finally {
        if (!handedOff) {
            try {
                resource.close();
            } catch (IOException ignored) {
                // Best effort: the exception that prevented the copy is the one worth reporting.
            }
        }
    }
}

From source file:com.lithium.flow.filer.HdfsFiler.java

License:Apache License

/**
 * Creates a filer backed by the given file system.
 *
 * <p>The overwrite flag is read from the file system's configuration key {@code "overwrite"}
 * and defaults to {@code true} when the key is absent.
 *
 * @param fileSystem the file system to wrap; must not be null.
 */
public HdfsFiler(@Nonnull FileSystem fileSystem) {
    // Reject null before anything is stored or dereferenced.
    checkNotNull(fileSystem);
    this.fileSystem = fileSystem;
    this.overwrite = fileSystem.getConf().getBoolean("overwrite", true);
}

From source file:com.mellanox.r4h.TestReadWhileWriting.java

License:Apache License

/**
 * Test reading while writing.
 *
 * <p>Starts a four-datanode mini cluster, writes half a block and hflushes it without
 * closing the file, verifies the data is readable, interrupts the lease renewer, waits
 * past the soft lease limit, appends another half block as a different test user and
 * finally verifies that the whole block is readable.
 */
@Test
public void pipeline_02_03() throws Exception {
    final Configuration conf = new HdfsConfiguration();
    conf.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);

    // create cluster
    final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(4).build();
    try {
        //change the lease limits.
        cluster.setLeasePeriod(SOFT_LEASE_LIMIT, HARD_LEASE_LIMIT);

        //wait for the cluster
        cluster.waitActive();
        final FileSystem fs = cluster.getFileSystem();
        final Path p = new Path(DIR, "file1");
        final int half = BLOCK_SIZE / 2;

        //a. On Machine M1, Create file. Write half block of data.
        //   Invoke DFSOutputStream.hflush() on the dfs file handle.
        //   Do not close file yet.
        {
            // Buffer size comes from the file system's own configuration (default 4096);
            // replication is 3 and the block size is BLOCK_SIZE.
            final FSDataOutputStream out = fs.create(p, true,
                    fs.getConf().getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), (short) 3,
                    BLOCK_SIZE);
            write(out, 0, half);

            //hflush
            ((DFSOutputStream) out.getWrappedStream()).hflush();
        }

        //b. On another machine M2, open file and verify that the half-block
        //   of data can be read successfully.
        checkFile(p, half, conf);
        MiniDFSClusterBridge.getAppendTestUtilLOG().info("leasechecker.interruptAndJoin()");
        // Stop lease renewal for the first writer; the sleep in step c then lets its lease lapse.
        ((DistributedFileSystem) fs).dfs.getLeaseRenewer().interruptAndJoin();

        //c. On M1, append another half block of data.  Close file on M1.
        {
            //sleep so that the soft lease limit is exceeded.
            Thread.sleep(2 * SOFT_LEASE_LIMIT);

            // Append through a fresh file system instance owned by a different test user
            // (current short user name + "x", group "supergroup").
            final UserGroupInformation current = UserGroupInformation.getCurrentUser();
            final UserGroupInformation ugi = UserGroupInformation
                    .createUserForTesting(current.getShortUserName() + "x", new String[] { "supergroup" });
            final DistributedFileSystem dfs = ugi.doAs(new PrivilegedExceptionAction<DistributedFileSystem>() {
                @Override
                public DistributedFileSystem run() throws Exception {
                    return (DistributedFileSystem) FileSystem.newInstance(conf);
                }
            });
            final FSDataOutputStream out = append(dfs, p);
            write(out, 0, half);
            out.close();
        }

        //d. On M2, open file and read 1 block of data from it. Close file.
        checkFile(p, 2 * half, conf);
    } finally {
        cluster.shutdown();
    }
}

From source file:com.netease.news.utils.SplitInput.java

License:Apache License

/**
 * Configure this instance based on the command-line arguments contained within provided array.
 * Calls {@link #validate()} to ensure consistency of configuration.
 *
 * <p>In map-reduce mode {@code randomSelectionPct} and {@code mapRedOutputDir} are required; otherwise
 * {@code trainingOutput} and {@code testOutput} are required together with at least one of the split
 * size/percentage or random selection size/percentage options. When the overwrite option is given, the
 * relevant output directories are deleted first.
 *
 * @param args the raw command-line arguments.
 * @return true if the arguments were parsed successfully and execution should proceed; false if parsing
 *         produced no result or an option was missing/inconsistent (help is printed in the latter case).
 * @throws Exception if there is a problem parsing the command-line arguments or the particular
 *                   combination would violate class invariants.
 */
private boolean parseArgs(String[] args) throws Exception {

    addInputOption();
    addOption("trainingOutput", "tr", "The training data output directory", false);
    addOption("testOutput", "te", "The test data output directory", false);
    addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false);
    addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false);
    // Help text fix: the parenthesised range description was missing its closing parenthesis.
    addOption("splitLocation", "sl",
            "Location for start of test data expressed as a percentage of the input file "
                    + "size (0=start, 50=middle, 100=end)",
            false);
    addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false);
    addOption("randomSelectionPct", "rp",
            "Percentage of items to be randomly selected as test data when using " + "mapreduce mode", false);
    addOption("charset", "c",
            "The name of the character encoding of the input files (not needed if using " + "SequenceFiles)",
            false);
    addOption(buildOption("sequenceFiles", "seq",
            "Set if the input files are sequence files.  Default is false", false, false, "false"));
    addOption(DefaultOptionCreator.methodOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());
    //TODO: extend this to sequential mode
    addOption("keepPct", "k",
            "The percentage of total data to keep in map-reduce mode, the rest will be ignored.  "
                    + "Default is 100%",
            false);
    addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false);

    if (parseArguments(args) == null) {
        return false;
    }

    try {
        inputDirectory = getInputPath();

        useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION)
                .equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD);

        if (useMapRed) {
            if (!hasOption("randomSelectionPct")) {
                throw new OptionException(getCLIOption("randomSelectionPct"),
                        "must set randomSelectionPct when mapRed option is used");
            }
            if (!hasOption("mapRedOutputDir")) {
                throw new OptionException(getCLIOption("mapRedOutputDir"),
                        "mapRedOutputDir must be set when mapRed option is used");
            }
            mapRedOutputDirectory = new Path(getOption("mapRedOutputDir"));
            if (hasOption("keepPct")) {
                keepPct = Integer.parseInt(getOption("keepPct"));
            }
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(getConf(), mapRedOutputDirectory);
            }
        } else {
            if (!hasOption("trainingOutput") || !hasOption("testOutput")) {
                throw new OptionException(getCLIOption("trainingOutput"),
                        "trainingOutput and testOutput must be set if mapRed option is not used");
            }
            if (!hasOption("testSplitSize") && !hasOption("testSplitPct") && !hasOption("randomSelectionPct")
                    && !hasOption("randomSelectionSize")) {
                throw new OptionException(getCLIOption("testSplitSize"),
                        "must set one of test split size/percentage or randomSelectionSize/percentage");
            }

            trainingOutputDirectory = new Path(getOption("trainingOutput"));
            testOutputDirectory = new Path(getOption("testOutput"));
            FileSystem fs = trainingOutputDirectory.getFileSystem(getConf());
            if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
                HadoopUtil.delete(fs.getConf(), trainingOutputDirectory);
                HadoopUtil.delete(fs.getConf(), testOutputDirectory);
            }
            fs.mkdirs(trainingOutputDirectory);
            fs.mkdirs(testOutputDirectory);
        }

        if (hasOption("charset")) {
            charset = Charset.forName(getOption("charset"));
        }

        if (hasOption("testSplitSize") && hasOption("testSplitPct")) {
            throw new OptionException(getCLIOption("testSplitPct"),
                    "must have either split size or split percentage " + "option, not BOTH");
        }

        if (hasOption("testSplitSize")) {
            setTestSplitSize(Integer.parseInt(getOption("testSplitSize")));
        }

        if (hasOption("testSplitPct")) {
            setTestSplitPct(Integer.parseInt(getOption("testSplitPct")));
        }

        if (hasOption("splitLocation")) {
            setSplitLocation(Integer.parseInt(getOption("splitLocation")));
        }

        if (hasOption("randomSelectionSize")) {
            setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize")));
        }

        if (hasOption("randomSelectionPct")) {
            setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct")));
        }

        useSequence = hasOption("sequenceFiles");

    } catch (OptionException e) {
        // Missing or conflicting options are reported to the user rather than propagated.
        log.error("Command-line option Exception", e);
        CommandLineUtil.printHelp(getGroup());
        return false;
    }

    validate();
    return true;
}

From source file:com.netease.news.utils.SplitInput.java

License:Apache License

/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the specified
 * training and test output directories. The {@link #validate()} method is called prior to executing the split.
 *
 * <p>If a random selection percentage or size is configured, test records are chosen at random; otherwise
 * they are taken from a contiguous region whose start is derived from the split location. At most the
 * computed test split size records go to the test output; everything else goes to the training output.
 *
 * @param inputFile the file to split; must exist and must not be a directory.
 * @throws IOException if the input is missing or a directory, or if reading/writing fails.
 * @throws IllegalArgumentException if the positional split would start before the first record.
 */
public void splitFile(Path inputFile) throws IOException {
    Configuration conf = getConf();
    FileSystem fs = inputFile.getFileSystem(conf);
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    }
    if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    // Non-null exactly when random selection (by percentage or by size) is in effect.
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            // Multiply as long: lineCount * pct overflows int for inputs above ~21M lines.
            testSplitSize = Math.round((long) lineCount * testRandomSelectionPct / 100.0f);
        }
        log.info("{} test split size is {} based on random selection percentage {}", inputFile.getName(),
                testSplitSize, testRandomSelectionPct);
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            // Sampled indices are shifted by one because record positions below are 1-based.
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round((long) lineCount * testSplitPct / 100.0f);
            log.info("{} test split size is {} based on percentage {}", inputFile.getName(), testSplitSize,
                    testSplitPct);
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round((long) lineCount * splitLocation / 100.0f);
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}", inputFile.getName(),
                    testSplitStart, splitLocation);
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException(
                    "test split size for " + inputFile + " is too large, it would produce an "
                            + "empty training set from the initial set of " + lineCount + " examples");
        } else if (lineCount - testSplitSize < testSplitSize) {
            log.warn(
                    "Test set size for {} may be too large, {} is larger than the number of "
                            + "lines remaining in the training set: {}",
                    inputFile, testSplitSize, lineCount - testSplitSize);
        }
    }
    int trainCount = 0;
    int testCount = 0;
    if (!useSequence) {
        // Opened inside the try so that a failure opening a later stream still closes the earlier ones.
        BufferedReader reader = null;
        Writer trainingWriter = null;
        Writer testWriter = null;

        try {
            reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
            trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
            testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

            String line;
            int pos = 0;
            while ((line = reader.readLine()) != null) {
                pos++;

                Writer writer;
                // Keyed on randomSel rather than the percentage so that a random selection
                // requested by size alone is honoured as well.
                if (randomSel != null) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        // Test quota reached: overflow goes to the training set.
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                writer.write(line);
                writer.write('\n');
            }

        } finally {
            Closeables.close(reader, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    } else {
        // Opened inside the try so that a failure creating a writer still closes the iterator.
        SequenceFileIterator<Writable, Writable> iterator = null;
        SequenceFile.Writer trainingWriter = null;
        SequenceFile.Writer testWriter = null;
        try {
            iterator = new SequenceFileIterator<Writable, Writable>(inputFile, false, fs.getConf());
            trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
                    iterator.getKeyClass(), iterator.getValueClass());
            testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
                    iterator.getKeyClass(), iterator.getValueClass());

            int pos = 0;
            while (iterator.hasNext()) {
                pos++;
                SequenceFile.Writer writer;
                if (randomSel != null) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        // Test quota reached: overflow goes to the training set.
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                Pair<Writable, Writable> pair = iterator.next();
                writer.append(pair.getFirst(), pair.getSecond());
            }

        } finally {
            Closeables.close(iterator, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    }
    log.info("file: {}, input: {} train: {}, test: {} starting at {}", inputFile.getName(), lineCount,
            trainCount, testCount, testSplitStart);

    // testing;
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}

From source file:com.ning.metrics.action.hdfs.data.RowFileContentsIteratorFactory.java

License:Apache License

/**
 * Builds a row iterator over the file at {@code path}.
 *
 * <p>The file is first opened as a sequence file; if that fails with an {@link IOException}
 * it is opened as a plain stream and read through the text iterator instead.
 *
 * @param fs file system holding the file.
 * @param path location of the file.
 * @param raw passed through unchanged to the created iterator.
 * @return an iterator over the rows of the file.
 * @throws IOException if the fallback plain open fails.
 */
public Iterator<Row> build(final FileSystem fs, final Path path, final boolean raw) throws IOException {
    final String location = path.toUri().getPath();
    try {
        final SequenceFile.Reader sequenceReader = new SequenceFile.Reader(fs, path, fs.getConf());
        return new RowSequenceFileContentsIterator(location, rowParser, registrar, sequenceReader, raw);
    } catch (IOException notASequenceFile) {
        // Not a Sequence file? Fall back to reading it as text.
        return new RowTextFileContentsIterator(location, rowParser, registrar, fs.open(path), raw);
    }
}

From source file:com.ning.metrics.collector.healthchecks.HadoopHealthCheck.java

License:Apache License

/**
 * Checks that Hadoop is reachable by obtaining a file system and reading its configuration.
 *
 * @return a healthy result listing each property in {@code HADOOP_PROPERTIES} with its configured
 *         value, or an unhealthy result naming the failure when the file system cannot be accessed.
 */
@Override
public Result check() {
    try {
        final FileSystem fileSystem = fsAccess.get(0); // No exponential backoff, fail early
        final Configuration fileSystemConf = fileSystem.getConf();
        final StringBuilder builder = new StringBuilder();

        for (final String prop : HADOOP_PROPERTIES) {
            builder.append(String.format("%s: %s, ", prop, fileSystemConf.get(prop)));
        }

        return Result.healthy(builder.toString());
    } catch (Exception e) {
        // A failure to reach Hadoop must be reported as unhealthy, otherwise this check can never fail.
        return Result.unhealthy("Exception when trying to access Hadoop: " + e);
    }
}

From source file:com.ricemap.spateDB.operations.Repartition.java

License:Apache License

/**
 * Samples points from the input files and packs them into prisms, returning one cell per prism.
 *
 * <p>The sampling ratio and the sample size budget are read from the configuration of
 * {@code outFileSystem}. The number of grid cells requested is the size of the last sampled file
 * divided by {@code blocksize} (rounded to nearest, at least one). Cell ids start at 1.
 *
 * @param fs file system containing the input files.
 * @param files input files to sample.
 * @param outFileSystem file system whose configuration supplies the sampling settings.
 * @param outFile not used by this method.
 * @param blocksize target block size in bytes used to derive the number of cells.
 * @param stockShape shape instance handed to the sampler.
 * @return one {@code CellInfo} per packed prism.
 * @throws IOException if sampling the input fails.
 */
public static <S extends Shape> CellInfo[] packInPrisms(FileSystem fs, Path[] files, FileSystem outFileSystem,
        Path outFile, long blocksize, S stockShape) throws IOException {
    final Vector<Point3d> sampledPoints = new Vector<Point3d>();

    double sampleRatio = outFileSystem.getConf().getFloat(SpatialSite.SAMPLE_RATIO, 0.01f);
    long sampleBytes = outFileSystem.getConf().getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024);

    // Each sample point is estimated to need 24 bytes, which bounds how many are collected.
    long maxSamplePoints = sampleBytes / 24;

    LOG.info("Reading a sample of " + (int) Math.round(sampleRatio * 100) + "%");
    ResultCollector<Point3d> pointCollector = new ResultCollector<Point3d>() {
        @Override
        public void collect(Point3d value) {
            // Store a copy of each reported point.
            sampledPoints.add(value.clone());
        }
    };
    Sampler.sampleWithRatio(fs, files, sampleRatio, maxSamplePoints, System.currentTimeMillis(),
            pointCollector, stockShape, new Point3d());
    LOG.info("Finished reading a sample of size: " + sampledPoints.size() + " records");

    long inFileSize = Sampler.sizeOfLastProcessedFile;

    // Start from an inverted (empty) box and grow it over the sample to approximate the MBR.
    Prism approxMBR = new Prism(Double.MAX_VALUE, Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE,
            -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Point3d point : sampledPoints) {
        approxMBR.expand(point);
    }

    // The approximate MBR only determines the cell dimensions; afterwards the grid
    // bounds are widened to the full double range.
    GridInfo gridInfo = new GridInfo(approxMBR.t1, approxMBR.x1, approxMBR.y1, approxMBR.t2, approxMBR.x2,
            approxMBR.y2);
    int desiredCells = Math.max(1, (int) ((inFileSize + blocksize / 2) / blocksize));
    gridInfo.calculateCellDimensions(desiredCells);
    gridInfo.set(-Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE, Double.MAX_VALUE, Double.MAX_VALUE,
            Double.MAX_VALUE);

    Point3d[] pointArray = sampledPoints.toArray(new Point3d[sampledPoints.size()]);
    Prism[] packedPrisms = RTree.packInPrisms(gridInfo, pointArray);

    CellInfo[] cells = new CellInfo[packedPrisms.length];
    int cellId = 0;
    for (Prism prism : packedPrisms) {
        cellId++;
        cells[cellId - 1] = new CellInfo(cellId, prism);
    }

    return cells;
}