List of usage examples for org.apache.hadoop.fs.FileSystem#listFiles
public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive) throws FileNotFoundException, IOException
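The examples below are taken from real projects. As a baseline, a minimal standalone sketch of the call pattern might look like the following; the hdfs://namenode:8020/data URI and the printing logic are illustrative assumptions, not drawn from any of the sources below.

    import java.io.IOException;
    import java.net.URI;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.LocatedFileStatus;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.fs.RemoteIterator;

    public class ListFilesExample {
        public static void main(String[] args) throws IOException {
            // Hypothetical HDFS URI; replace with your own cluster and directory
            String uri = "hdfs://namenode:8020/data";
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(URI.create(uri), conf);

            // Recursively list all files (not directories) under the given path
            RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path(uri), true);
            while (it.hasNext()) {
                LocatedFileStatus status = it.next();
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }

Note that listFiles returns files only, never directories, and that hasNext() and next() can themselves throw IOException because the listing is fetched lazily from the NameNode.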
From source file:org.deeplearning4j.hadoop.datasetiterator.BaseHdfsDataSetIterator.java
License:Apache License
    /**
     * List all of the files in the hdfsUriRootDir directory
     * @return the list of paths in the directory
     * @throws Exception if one occurs
     */
    public List<Path> filesInDir() throws Exception {
        FileSystem fs = FileSystem.get(conf);
        List<Path> paths = new ArrayList<Path>();
        RemoteIterator<LocatedFileStatus> iter = fs.listFiles(new Path(hdfsUriRootDir), true);
        while (iter.hasNext()) {
            LocatedFileStatus l = iter.next();
            paths.add(l.getPath());
        }
        fs.close();
        return paths;
    }
From source file:org.deeplearning4j.patent.DownloadPreprocessPatents.java
License:Apache License
    public static List<String> listPaths(JavaSparkContext sc, String path, boolean recursive) throws IOException {
        if (path.endsWith(".blob.core.windows.net/") || path.endsWith(".blob.core.windows.net")) {
            //Azure library bug: seems that we get an infinite loop if we try to list paths on the
            // root directory, for some versions of the Azure Hadoop library - deadlocks on fileIter.hasNext()
            throw new IllegalStateException("Cannot list paths from root directory due to Azure library bug");
        }

        List<String> paths = new ArrayList<>();
        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(URI.create(path), config);
        RemoteIterator fileIter = hdfs.listFiles(new Path(path), recursive);
        while (fileIter.hasNext()) {
            String filePath = ((LocatedFileStatus) fileIter.next()).getPath().toString();
            paths.add(filePath);
        }
        return paths;
    }
From source file:org.deeplearning4j.patent.TrainPatentClassifier.java
License:Apache License
    private JavaRDD<String> listPathsSubset(JavaSparkContext sc, String path, int max, int rngSeed) throws IOException {
        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(URI.create(path), config);
        RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(path), true);

        List<String> paths = new ArrayList<>();
        while (fileIter.hasNext()) {
            String filePath = fileIter.next().getPath().toString();
            paths.add(filePath);
        }

        //Now, get a consistent random subset - assuming here that file listing isn't consistent
        Collections.sort(paths);

        int[] arr = new int[paths.size()];
        for (int i = 0; i < arr.length; i++) {
            arr[i] = i;
        }
        MathUtils.shuffleArray(arr, rngSeed);

        List<String> out = new ArrayList<>();
        for (int i = 0; i < arr.length && i < max; i++) {
            out.add(paths.get(arr[i]));
        }

        return sc.parallelize(out);
    }
From source file:org.deeplearning4j.spark.impl.paramavg.TestSparkMultiLayerParameterAveraging.java
License:Apache License
    @Test
    public void testFitViaStringPaths() throws Exception {
        Path tempDir = testDir.newFolder("DL4J-testFitViaStringPaths").toPath();
        File tempDirF = tempDir.toFile();
        tempDirF.deleteOnExit();

        int dataSetObjSize = 5;
        int batchSizePerExecutor = 25;
        DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
        int i = 0;
        while (iter.hasNext()) {
            File nextFile = new File(tempDirF, i + ".bin");
            DataSet ds = iter.next();
            ds.save(nextFile);
            i++;
        }

        System.out.println("Saved to: " + tempDirF.getAbsolutePath());

        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
                .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
                .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                        .activation(Activation.TANH).build())
                .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
                .pretrain(false).backprop(true).build();

        SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
                new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                        .workerPrefetchNumBatches(5).batchSizePerWorker(batchSizePerExecutor).averagingFrequency(1)
                        .repartionData(Repartition.Always).build());
        sparkNet.setCollectTrainingStats(true);

        //List files:
        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
        RemoteIterator<LocatedFileStatus> fileIter = hdfs
                .listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
        List<String> paths = new ArrayList<>();
        while (fileIter.hasNext()) {
            String path = fileIter.next().getPath().toString();
            paths.add(path);
        }

        INDArray paramsBefore = sparkNet.getNetwork().params().dup();
        JavaRDD<String> pathRdd = sc.parallelize(paths);
        sparkNet.fitPaths(pathRdd);

        INDArray paramsAfter = sparkNet.getNetwork().params().dup();
        assertNotEquals(paramsBefore, paramsAfter);

        SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
        System.out.println(stats.statsAsString());

        sparkNet.getTrainingMaster().deleteTempFiles(sc);
    }
From source file:org.deeplearning4j.spark.impl.paramavg.TestSparkMultiLayerParameterAveraging.java
License:Apache License
    @Test
    public void testFitViaStringPathsSize1() throws Exception {
        Path tempDir = testDir.newFolder("DL4J-testFitViaStringPathsSize1").toPath();
        File tempDirF = tempDir.toFile();
        tempDirF.deleteOnExit();

        int dataSetObjSize = 1;
        int batchSizePerExecutor = 25;
        int numSplits = 10;
        int averagingFrequency = 3;
        int totalExamples = numExecutors() * batchSizePerExecutor * numSplits * averagingFrequency;
        DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, totalExamples, false);
        int i = 0;
        while (iter.hasNext()) {
            File nextFile = new File(tempDirF, i + ".bin");
            DataSet ds = iter.next();
            ds.save(nextFile);
            i++;
        }

        System.out.println("Saved to: " + tempDirF.getAbsolutePath());

        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
                .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
                .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                        .activation(Activation.TANH).build())
                .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                        .nIn(50).nOut(10).activation(Activation.SOFTMAX).build())
                .pretrain(false).backprop(true).build();

        SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
                new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                        .workerPrefetchNumBatches(5).batchSizePerWorker(batchSizePerExecutor)
                        .averagingFrequency(averagingFrequency).repartionData(Repartition.Always).build());
        sparkNet.setCollectTrainingStats(true);

        //List files:
        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
        RemoteIterator<LocatedFileStatus> fileIter = hdfs
                .listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
        List<String> paths = new ArrayList<>();
        while (fileIter.hasNext()) {
            String path = fileIter.next().getPath().toString();
            paths.add(path);
        }

        INDArray paramsBefore = sparkNet.getNetwork().params().dup();
        JavaRDD<String> pathRdd = sc.parallelize(paths);
        sparkNet.fitPaths(pathRdd);

        INDArray paramsAfter = sparkNet.getNetwork().params().dup();
        assertNotEquals(paramsBefore, paramsAfter);

        Thread.sleep(2000);
        SparkTrainingStats stats = sparkNet.getSparkTrainingStats();

        //Expect
        System.out.println(stats.statsAsString());

        assertEquals(numSplits, stats.getValue("ParameterAveragingMasterRepartitionTimesMs").size());
        List<EventStats> list = stats.getValue("ParameterAveragingWorkerFitTimesMs");
        assertEquals(numSplits * numExecutors() * averagingFrequency, list.size());
        for (EventStats es : list) {
            ExampleCountEventStats e = (ExampleCountEventStats) es;
            assertTrue(batchSizePerExecutor * averagingFrequency - 10 >= e.getTotalExampleCount());
        }

        sparkNet.getTrainingMaster().deleteTempFiles(sc);
    }
From source file:org.deeplearning4j.spark.impl.paramavg.TestSparkMultiLayerParameterAveraging.java
License:Apache License
    @Test
    public void testFitViaStringPathsCompGraph() throws Exception {
        Path tempDir = testDir.newFolder("DL4J-testFitViaStringPathsCG").toPath();
        Path tempDir2 = testDir.newFolder("DL4J-testFitViaStringPathsCG-MDS").toPath();
        File tempDirF = tempDir.toFile();
        File tempDirF2 = tempDir2.toFile();
        tempDirF.deleteOnExit();
        tempDirF2.deleteOnExit();

        int dataSetObjSize = 5;
        int batchSizePerExecutor = 25;
        DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
        int i = 0;
        while (iter.hasNext()) {
            File nextFile = new File(tempDirF, i + ".bin");
            File nextFile2 = new File(tempDirF2, i + ".bin");
            DataSet ds = iter.next();
            MultiDataSet mds = new MultiDataSet(ds.getFeatures(), ds.getLabels());
            ds.save(nextFile);
            mds.save(nextFile2);
            i++;
        }

        System.out.println("Saved to: " + tempDirF.getAbsolutePath());
        System.out.println("Saved to: " + tempDirF2.getAbsolutePath());

        ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
                .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).graphBuilder().addInputs("in")
                .addLayer("0", new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                        .activation(Activation.TANH).build(), "in")
                .addLayer("1",
                        new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                                .nIn(50).nOut(10).activation(Activation.SOFTMAX).build(),
                        "0")
                .setOutputs("1").pretrain(false).backprop(true).build();

        SparkComputationGraph sparkNet = new SparkComputationGraph(sc, conf,
                new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                        .workerPrefetchNumBatches(5).workerPrefetchNumBatches(0)
                        .batchSizePerWorker(batchSizePerExecutor).averagingFrequency(1)
                        .repartionData(Repartition.Always).build());
        sparkNet.setCollectTrainingStats(true);

        //List files:
        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
        RemoteIterator<LocatedFileStatus> fileIter = hdfs
                .listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);
        List<String> paths = new ArrayList<>();
        while (fileIter.hasNext()) {
            String path = fileIter.next().getPath().toString();
            paths.add(path);
        }

        INDArray paramsBefore = sparkNet.getNetwork().params().dup();
        JavaRDD<String> pathRdd = sc.parallelize(paths);
        sparkNet.fitPaths(pathRdd);

        INDArray paramsAfter = sparkNet.getNetwork().params().dup();
        assertNotEquals(paramsBefore, paramsAfter);

        SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
        System.out.println(stats.statsAsString());

        //Same thing, but for MultiDataSet objects:
        config = new Configuration();
        hdfs = FileSystem.get(tempDir2.toUri(), config);
        fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir2.toString()), false);
        paths = new ArrayList<>();
        while (fileIter.hasNext()) {
            String path = fileIter.next().getPath().toString();
            paths.add(path);
        }

        paramsBefore = sparkNet.getNetwork().params().dup();
        pathRdd = sc.parallelize(paths);
        sparkNet.fitPathsMultiDataSet(pathRdd);

        paramsAfter = sparkNet.getNetwork().params().dup();
        assertNotEquals(paramsBefore, paramsAfter);

        stats = sparkNet.getSparkTrainingStats();
        System.out.println(stats.statsAsString());
    }
From source file:org.icgc.dcc.download.client.io.ArchiveOutputStream.java
License:Open Source License
    @SneakyThrows
    private long calculateDataTypeArchiveSize(FileSystem fileSystem, Path downloadTypePath) {
        val files = fileSystem.listFiles(downloadTypePath, false);
        long totalSize = 0L;
        while (files.hasNext()) {
            val file = files.next();
            if (isPartFile(file.getPath())) {
                totalSize += file.getLen();
            }
        }
        return totalSize;
    }
From source file:org.icgc.dcc.release.core.util.HadoopFileSystemUtils.java
License:Open Source License
    private static List<LocatedFileStatus> getFiles(FileSystem fileSystem, Path target, boolean recursive) {
        val results = Lists.<LocatedFileStatus>newArrayList();
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = null;
        try {
            fileStatusListIterator = fileSystem.listFiles(target, recursive);
            while (fileStatusListIterator.hasNext()) {
                LocatedFileStatus fileStatus = fileStatusListIterator.next();
                results.add(fileStatus);
            }
        } catch (IOException e) {
            log.info("Error retrieving files in path '{}'", target);
        }
        return results;
    }
From source file:org.springframework.cloud.dataflow.yarn.buildtests.AbstractCliBootYarnClusterTests.java
License:Apache License
    protected String dumpFs() throws IOException {
        StringBuilder buf = new StringBuilder();
        FileSystem fs = FileSystem.get(getConfiguration());
        RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path("/"), true);
        while (files.hasNext()) {
            buf.append(files.next().toString());
            buf.append("\n");
        }
        return buf.toString();
    }
From source file:org.trend.hgraph.util.test.GetRandomRowsByRegionsTest.java
License:Apache License
    @Test
    public void test_run_b2t3() throws Exception {
        String outputPath = "/run_b2t3";
        GetRandomRowsByRegions tool = new GetRandomRowsByRegions(TEST_UTIL.getConfiguration());
        int status = tool.run(new String[] { "-b", "2", "-t", "3", TABLE, outputPath });
        Assert.assertEquals(0, status);

        // get content, for manual test purpose
        Path path = new Path(outputPath);
        FileSystem fs = path.getFileSystem(TEST_UTIL.getConfiguration());
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(path, false);
        LocatedFileStatus lfs = null;
        InputStream is = null;
        String fn = null;
        while (it.hasNext()) {
            lfs = it.next();
            fn = lfs.getPath().getName();
            if (fn.startsWith("part-")) {
                System.out.println("content for file:" + fn);
                is = fs.open(lfs.getPath());
                System.out.println(IOUtils.toString(is));
                IOUtils.closeQuietly(is);
            }
        }
    }