Example usage for org.apache.hadoop.fs FileSystem listFiles

List of usage examples for org.apache.hadoop.fs FileSystem listFiles

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem listFiles.

Prototype

public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive)
        throws FileNotFoundException, IOException 

Document

List the statuses and block locations of the files in the given path.
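
The snippet below is a minimal, self-contained sketch of the typical calling pattern: obtain a FileSystem for a URI, call listFiles, and drain the RemoteIterator. The directory URI is a placeholder for illustration only; listFiles returns files (not directories), and the second argument controls whether subdirectories are traversed.

import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListFilesExample {

    public static void main(String[] args) throws Exception {
        // Placeholder directory URI; replace with a real HDFS or local path
        String dir = "hdfs://localhost:9000/tmp/data";

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(dir), conf);

        // The boolean flag requests recursive traversal of subdirectories;
        // listFiles only ever returns files, never directories
        RemoteIterator<LocatedFileStatus> iter = fs.listFiles(new Path(dir), true);

        List<Path> paths = new ArrayList<>();
        while (iter.hasNext()) {
            paths.add(iter.next().getPath());
        }

        for (Path p : paths) {
            System.out.println(p);
        }
    }
}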

Usage

From source file: org.deeplearning4j.hadoop.datasetiterator.BaseHdfsDataSetIterator.java

License: Apache License

/**
 * List all of the files in the
 * hdfsUriRootDir directory
 * @return the list of paths in the directory
 * @throws Exception if one occurs
 */
public List<Path> filesInDir() throws Exception {
    FileSystem fs = FileSystem.get(conf);
    List<Path> paths = new ArrayList<Path>();
    RemoteIterator<LocatedFileStatus> iter = fs.listFiles(new Path(hdfsUriRootDir), true);
    while (iter.hasNext()) {
        LocatedFileStatus l = iter.next();
        paths.add(l.getPath());
    }

    // Note: FileSystem.get(conf) may return a cached instance shared across the JVM,
    // so closing it here also closes it for any other code using that FileSystem
    fs.close();
    return paths;

}

From source file: org.deeplearning4j.patent.DownloadPreprocessPatents.java

License: Apache License

public static List<String> listPaths(JavaSparkContext sc, String path, boolean recursive) throws IOException {
    if (path.endsWith(".blob.core.windows.net/") || path.endsWith(".blob.core.windows.net")) {
        //Azure library bug: seems that we get an infinite loop if we try to list paths on the
        // root directory, for some versions of the Azure Hadoop library - deadlocks on fileIter.hasNext()
        throw new IllegalStateException("Cannot list paths from root directory due to Azure library bug");
    }

    List<String> paths = new ArrayList<>();
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new Path(path), recursive);

    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        paths.add(filePath);
    }

    return paths;
}

From source file: org.deeplearning4j.patent.TrainPatentClassifier.java

License: Apache License

private JavaRDD<String> listPathsSubset(JavaSparkContext sc, String path, int max, int rngSeed)
        throws IOException {
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(path), true);

    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        paths.add(filePath);
    }

    //Now, get a consistent random subset - assuming here that the file listing order isn't consistent
    Collections.sort(paths);
    int[] arr = new int[paths.size()];
    for (int i = 0; i < arr.length; i++) {
        arr[i] = i;
    }
    MathUtils.shuffleArray(arr, rngSeed);

    List<String> out = new ArrayList<>();
    for (int i = 0; i < arr.length && i < max; i++) {
        out.add(paths.get(arr[i]));
    }

    return sc.parallelize(out);
}

From source file: org.deeplearning4j.spark.impl.paramavg.TestSparkMultiLayerParameterAveraging.java

License: Apache License

@Test
public void testFitViaStringPaths() throws Exception {

    Path tempDir = testDir.newFolder("DL4J-testFitViaStringPaths").toPath();
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();

    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }

    System.out.println("Saved to: " + tempDirF.getAbsolutePath());

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
            .layer(0,
                    new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                            .activation(Activation.TANH).build())
            .layer(1,
                    new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                            .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
            .pretrain(false).backprop(true).build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(5).batchSizePerWorker(batchSizePerExecutor).averagingFrequency(1)
                    .repartionData(Repartition.Always).build());
    sparkNet.setCollectTrainingStats(true);

    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs
            .listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);

    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);

    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());

    sparkNet.getTrainingMaster().deleteTempFiles(sc);
}

From source file: org.deeplearning4j.spark.impl.paramavg.TestSparkMultiLayerParameterAveraging.java

License: Apache License

@Test
public void testFitViaStringPathsSize1() throws Exception {

    Path tempDir = testDir.newFolder("DL4J-testFitViaStringPathsSize1").toPath();
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();

    int dataSetObjSize = 1;
    int batchSizePerExecutor = 25;
    int numSplits = 10;
    int averagingFrequency = 3;
    int totalExamples = numExecutors() * batchSizePerExecutor * numSplits * averagingFrequency;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, totalExamples, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }

    System.out.println("Saved to: " + tempDirF.getAbsolutePath());

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
            .layer(0,
                    new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                            .activation(Activation.TANH).build())
            .layer(1,
                    new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                            .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
            .pretrain(false).backprop(true).build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(5).batchSizePerWorker(batchSizePerExecutor)
                    .averagingFrequency(averagingFrequency).repartionData(Repartition.Always).build());
    sparkNet.setCollectTrainingStats(true);

    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs
            .listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);

    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);

    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    Thread.sleep(2000);
    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();

    //Check the expected stats:
    System.out.println(stats.statsAsString());
    assertEquals(numSplits, stats.getValue("ParameterAveragingMasterRepartitionTimesMs").size());

    List<EventStats> list = stats.getValue("ParameterAveragingWorkerFitTimesMs");
    assertEquals(numSplits * numExecutors() * averagingFrequency, list.size());
    for (EventStats es : list) {
        ExampleCountEventStats e = (ExampleCountEventStats) es;
        assertTrue(batchSizePerExecutor * averagingFrequency - 10 >= e.getTotalExampleCount());
    }

    sparkNet.getTrainingMaster().deleteTempFiles(sc);
}

From source file: org.deeplearning4j.spark.impl.paramavg.TestSparkMultiLayerParameterAveraging.java

License: Apache License

@Test
public void testFitViaStringPathsCompGraph() throws Exception {

    Path tempDir = testDir.newFolder("DL4J-testFitViaStringPathsCG").toPath();
    Path tempDir2 = testDir.newFolder("DL4J-testFitViaStringPathsCG-MDS").toPath();
    File tempDirF = tempDir.toFile();
    File tempDirF2 = tempDir2.toFile();
    tempDirF.deleteOnExit();
    tempDirF2.deleteOnExit();

    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        File nextFile2 = new File(tempDirF2, i + ".bin");
        DataSet ds = iter.next();
        MultiDataSet mds = new MultiDataSet(ds.getFeatures(), ds.getLabels());
        ds.save(nextFile);
        mds.save(nextFile2);
        i++;
    }

    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    System.out.println("Saved to: " + tempDirF2.getAbsolutePath());

    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).graphBuilder().addInputs("in")
            .addLayer("0",
                    new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                            .activation(Activation.TANH).build(),
                    "in")
            .addLayer("1",
                    new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                            .nIn(50).nOut(10).activation(Activation.SOFTMAX).build(),
                    "0")
            .setOutputs("1").pretrain(false).backprop(true).build();

    SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(0)
                    .batchSizePerWorker(batchSizePerExecutor).averagingFrequency(1)
                    .repartionData(Repartition.Always).build());
    sparkNet.setCollectTrainingStats(true);

    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs
            .listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);

    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);

    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());

    //Same thing, but for MultiDataSet objects:
    config = new Configuration();
    hdfs = FileSystem.get(tempDir2.toUri(), config);
    fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir2.toString()), false);

    paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    paramsBefore = sparkNet.getNetwork().params().dup();
    pathRdd = sc.parallelize(paths);
    sparkNet.fitPathsMultiDataSet(pathRdd);

    paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
}

From source file: org.icgc.dcc.download.client.io.ArchiveOutputStream.java

License: Open Source License

@SneakyThrows
private long calculateDataTypeArchiveSize(FileSystem fileSystem, Path downloadTypePath) {
    val files = fileSystem.listFiles(downloadTypePath, false);

    long totalSize = 0L;
    while (files.hasNext()) {
        val file = files.next();
        if (isPartFile(file.getPath())) {
            totalSize += file.getLen();
        }
    }

    return totalSize;
}

From source file: org.icgc.dcc.release.core.util.HadoopFileSystemUtils.java

License: Open Source License

private static List<LocatedFileStatus> getFiles(FileSystem fileSystem, Path target, boolean recursive) {
    val results = Lists.<LocatedFileStatus>newArrayList();
    RemoteIterator<LocatedFileStatus> fileStatusListIterator = null;
    try {
        fileStatusListIterator = fileSystem.listFiles(target, recursive);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();
            results.add(fileStatus);
        }
    } catch (IOException e) {
        log.info("Error retrieving files in path '{}'", target);
    }
    }
    return results;
}

From source file: org.springframework.cloud.dataflow.yarn.buildtests.AbstractCliBootYarnClusterTests.java

License: Apache License

protected String dumpFs() throws IOException {
    StringBuilder buf = new StringBuilder();
    FileSystem fs = FileSystem.get(getConfiguration());
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path("/"), true);
    while (files.hasNext()) {
        buf.append(files.next().toString());
        buf.append("\n");
    }
    return buf.toString();
}

From source file: org.trend.hgraph.util.test.GetRandomRowsByRegionsTest.java

License: Apache License

@Test
public void test_run_b2t3() throws Exception {
    String outputPath = "/run_b2t3";
    GetRandomRowsByRegions tool = new GetRandomRowsByRegions(TEST_UTIL.getConfiguration());
    int status = tool.run(new String[] { "-b", "2", "-t", "3", TABLE, outputPath });
    Assert.assertEquals(0, status);
    // get content, for manual test purpose
    Path path = new Path(outputPath);
    FileSystem fs = path.getFileSystem(TEST_UTIL.getConfiguration());
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(path, false);
    LocatedFileStatus lfs = null;
    InputStream is = null;
    String fn = null;
    while (it.hasNext()) {
        lfs = it.next();
        fn = lfs.getPath().getName();
        if (fn.startsWith("part-")) {
            System.out.println("content for file:" + fn);
            is = fs.open(lfs.getPath());
            System.out.println(IOUtils.toString(is));
            IOUtils.closeQuietly(is);
        }
    }
}