Example usage for org.apache.hadoop.fs FileSystem listFiles

List of usage examples for org.apache.hadoop.fs FileSystem listFiles

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem listFiles.

Prototype

public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive)
        throws FileNotFoundException, IOException 

Document

List the statuses and block locations of the files in the given path.
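
The snippet below is a minimal, self-contained sketch of the typical calling pattern: obtain a FileSystem for a URI, call listFiles, and drain the RemoteIterator. The directory URI is a placeholder for illustration only; listFiles returns files (not directories), and the second argument controls whether subdirectories are traversed.

import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListFilesExample {

    public static void main(String[] args) throws Exception {
        // Placeholder directory URI; replace with a real HDFS or local path
        String dir = "hdfs://localhost:9000/tmp/data";

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(dir), conf);

        // The boolean flag requests recursive traversal of subdirectories;
        // listFiles only ever returns files, never directories
        RemoteIterator<LocatedFileStatus> iter = fs.listFiles(new Path(dir), true);

        List<Path> paths = new ArrayList<>();
        while (iter.hasNext()) {
            paths.add(iter.next().getPath());
        }

        for (Path p : paths) {
            System.out.println(p);
        }
    }
}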

Usage

From source file: org.deeplearning4j.hadoop.datasetiterator.BaseHdfsDataSetIterator.java

License: Apache License

/**
 * List all of the files in the
 * hdfsUriRootDir directory
 * @return the list of paths in the directory
 * @throws Exception if one occurs
 */
public List<Path> filesInDir() throws Exception {
    FileSystem fs = FileSystem.get(conf);
    List<Path> paths = new ArrayList<Path>();
    RemoteIterator<LocatedFileStatus> iter = fs.listFiles(new Path(hdfsUriRootDir), true);
    while (iter.hasNext()) {
        LocatedFileStatus l = iter.next();
        paths.add(l.getPath());
    }

    // Note: FileSystem.get(conf) may return a cached instance shared across the JVM,
    // so closing it here also closes it for any other code using that FileSystem
    fs.close();
    return paths;

}

From source file: org.deeplearning4j.patent.DownloadPreprocessPatents.java

License: Apache License

public static List<String> listPaths(JavaSparkContext sc, String path, boolean recursive) throws IOException {
    if (path.endsWith(".blob.core.windows.net/") || path.endsWith(".blob.core.windows.net")) {
        //Azure library bug: seems that we get an infinite loop if we try to list paths on the
        // root directory, for some versions of the Azure Hadoop library - deadlocks on fileIter.hasNext()
        throw new IllegalStateException("Cannot list paths from root directory due to Azure library bug");
    }

    List<String> paths = new ArrayList<>();
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new Path(path), recursive);

    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        paths.add(filePath);
    }

    return paths;
}

From source file: org.deeplearning4j.patent.TrainPatentClassifier.java

License: Apache License

private JavaRDD<String> listPathsSubset(JavaSparkContext sc, String path, int max, int rngSeed)
        throws IOException {
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(path), true);

    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        paths.add(filePath);
    }

    //Now, get a consistent random subset - assuming here that the file listing order isn't consistent
    Collections.sort(paths);
    int[] arr = new int[paths.size()];
    for (int i = 0; i < arr.length; i++) {
        arr[i] = i;
    }
    MathUtils.shuffleArray(arr, rngSeed);

    List<String> out = new ArrayList<>();
    for (int i = 0; i < arr.length && i < max; i++) {
        out.add(paths.get(arr[i]));
    }

    return sc.parallelize(out);
}

From source file: org.deeplearning4j.spark.impl.paramavg.TestSparkMultiLayerParameterAveraging.java

License: Apache License

@Test
public void testFitViaStringPaths() throws Exception {

    Path tempDir = testDir.newFolder("DL4J-testFitViaStringPaths").toPath();
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();

    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }

    System.out.println("Saved to: " + tempDirF.getAbsolutePath());

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
            .layer(0,
                    new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                            .activation(Activation.TANH).build())
            .layer(1,
                    new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                            .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
            .pretrain(false).backprop(true).build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(5).batchSizePerWorker(batchSizePerExecutor).averagingFrequency(1)
                    .repartionData(Repartition.Always).build());
    sparkNet.setCollectTrainingStats(true);

    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs
            .listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);

    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);

    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());

    sparkNet.getTrainingMaster().deleteTempFiles(sc);
}

From source file: org.deeplearning4j.spark.impl.paramavg.TestSparkMultiLayerParameterAveraging.java

License: Apache License

@Test
public void testFitViaStringPathsSize1() throws Exception {

    Path tempDir = testDir.newFolder("DL4J-testFitViaStringPathsSize1").toPath();
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();

    int dataSetObjSize = 1;
    int batchSizePerExecutor = 25;
    int numSplits = 10;
    int averagingFrequency = 3;
    int totalExamples = numExecutors() * batchSizePerExecutor * numSplits * averagingFrequency;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, totalExamples, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }

    System.out.println("Saved to: " + tempDirF.getAbsolutePath());

    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
            .layer(0,
                    new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                            .activation(Activation.TANH).build())
            .layer(1,
                    new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                            .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
            .pretrain(false).backprop(true).build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(5).batchSizePerWorker(batchSizePerExecutor)
                    .averagingFrequency(averagingFrequency).repartionData(Repartition.Always).build());
    sparkNet.setCollectTrainingStats(true);

    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs
            .listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);

    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);

    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    Thread.sleep(2000);
    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();

    //Check the expected stats:
    System.out.println(stats.statsAsString());
    assertEquals(numSplits, stats.getValue("ParameterAveragingMasterRepartitionTimesMs").size());

    List<EventStats> list = stats.getValue("ParameterAveragingWorkerFitTimesMs");
    assertEquals(numSplits * numExecutors() * averagingFrequency, list.size());
    for (EventStats es : list) {
        ExampleCountEventStats e = (ExampleCountEventStats) es;
        assertTrue(batchSizePerExecutor * averagingFrequency - 10 >= e.getTotalExampleCount());
    }

    sparkNet.getTrainingMaster().deleteTempFiles(sc);
}

From source file: org.deeplearning4j.spark.impl.paramavg.TestSparkMultiLayerParameterAveraging.java

License: Apache License

@Test
public void testFitViaStringPathsCompGraph() throws Exception {

    Path tempDir = testDir.newFolder("DL4J-testFitViaStringPathsCG").toPath();
    Path tempDir2 = testDir.newFolder("DL4J-testFitViaStringPathsCG-MDS").toPath();
    File tempDirF = tempDir.toFile();
    File tempDirF2 = tempDir2.toFile();
    tempDirF.deleteOnExit();
    tempDirF2.deleteOnExit();

    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
    int i = 0;
    while (iter.hasNext()) {
        File nextFile = new File(tempDirF, i + ".bin");
        File nextFile2 = new File(tempDirF2, i + ".bin");
        DataSet ds = iter.next();
        MultiDataSet mds = new MultiDataSet(ds.getFeatures(), ds.getLabels());
        ds.save(nextFile);
        mds.save(nextFile2);
        i++;
    }

    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    System.out.println("Saved to: " + tempDirF2.getAbsolutePath());

    ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).graphBuilder().addInputs("in")
            .addLayer("0",
                    new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                            .activation(Activation.TANH).build(),
                    "in")
            .addLayer("1",
                    new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                            .nIn(50).nOut(10).activation(Activation.SOFTMAX).build(),
                    "0")
            .setOutputs("1").pretrain(false).backprop(true).build();

    SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(0)
                    .batchSizePerWorker(batchSizePerExecutor).averagingFrequency(1)
                    .repartionData(Repartition.Always).build());
    sparkNet.setCollectTrainingStats(true);

    //List files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs
            .listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);

    List<String> paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);

    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());

    //Same thing, but for MultiDataSet objects:
    config = new Configuration();
    hdfs = FileSystem.get(tempDir2.toUri(), config);
    fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir2.toString()), false);

    paths = new ArrayList<>();
    while (fileIter.hasNext()) {
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    paramsBefore = sparkNet.getNetwork().params().dup();
    pathRdd = sc.parallelize(paths);
    sparkNet.fitPathsMultiDataSet(pathRdd);

    paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
}

From source file: org.icgc.dcc.download.client.io.ArchiveOutputStream.java

License: Open Source License

@SneakyThrows
private long calculateDataTypeArchiveSize(FileSystem fileSystem, Path downloadTypePath) {
    val files = fileSystem.listFiles(downloadTypePath, false);

    long totalSize = 0L;
    while (files.hasNext()) {
        val file = files.next();
        if (isPartFile(file.getPath())) {
            totalSize += file.getLen();
        }
    }

    return totalSize;
}

From source file: org.icgc.dcc.release.core.util.HadoopFileSystemUtils.java

License: Open Source License

private static List<LocatedFileStatus> getFiles(FileSystem fileSystem, Path target, boolean recursive) {
    val results = Lists.<LocatedFileStatus>newArrayList();
    RemoteIterator<LocatedFileStatus> fileStatusListIterator = null;
    try {
        fileStatusListIterator = fileSystem.listFiles(target, recursive);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();
            results.add(fileStatus);
        }
    } catch (IOException e) {
        log.info("Error retrieving files in path '{}'", target);
    }
    }
    return results;
}

From source file: org.springframework.cloud.dataflow.yarn.buildtests.AbstractCliBootYarnClusterTests.java

License: Apache License

protected String dumpFs() throws IOException {
    StringBuilder buf = new StringBuilder();
    FileSystem fs = FileSystem.get(getConfiguration());
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path("/"), true);
    while (files.hasNext()) {
        buf.append(files.next().toString());
        buf.append("\n");
    }
    return buf.toString();
}

From source file: org.trend.hgraph.util.test.GetRandomRowsByRegionsTest.java

License: Apache License

@Test
public void test_run_b2t3() throws Exception {
    String outputPath = "/run_b2t3";
    GetRandomRowsByRegions tool = new GetRandomRowsByRegions(TEST_UTIL.getConfiguration());
    int status = tool.run(new String[] { "-b", "2", "-t", "3", TABLE, outputPath });
    Assert.assertEquals(0, status);
    // get content, for manual test purpose
    Path path = new Path(outputPath);
    FileSystem fs = path.getFileSystem(TEST_UTIL.getConfiguration());
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(path, false);
    LocatedFileStatus lfs = null;
    InputStream is = null;
    String fn = null;
    while (it.hasNext()) {
        lfs = it.next();
        fn = lfs.getPath().getName();
        if (fn.startsWith("part-")) {
            System.out.println("content for file:" + fn);
            is = fs.open(lfs.getPath());
            System.out.println(IOUtils.toString(is));
            IOUtils.closeQuietly(is);
        }
    }
}