Example usage for org.apache.hadoop.fs FileSystem listFiles

List of usage examples for org.apache.hadoop.fs FileSystem listFiles

Introduction

On this page you can find example usage of org.apache.hadoop.fs FileSystem listFiles.

Prototype

public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive)
        throws FileNotFoundException, IOException 

Source Link

Document

List the statuses and block locations of the files in the given path.
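
A minimal, self-contained sketch of the call pattern follows, for orientation before the full project examples in the Usage section. The path, configuration, and class name are illustrative assumptions, not taken from any of the quoted sources.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListFilesSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/example");               // hypothetical input directory
        FileSystem fs = dir.getFileSystem(conf);
        try {
            // Recursively list every file under dir; directories themselves are not returned.
            RemoteIterator<LocatedFileStatus> it = fs.listFiles(dir, true);
            while (it.hasNext()) {
                LocatedFileStatus status = it.next();
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
                // Each LocatedFileStatus also carries the block locations of the file.
                for (BlockLocation block : status.getBlockLocations()) {
                    System.out.println("  block at offset " + block.getOffset() + " on "
                            + String.join(",", block.getHosts()));
                }
            }
        } finally {
            fs.close();
        }
    }
}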

Usage

From source file:org.apache.nutch.tools.CommonCrawlDataDumper.java

License:Apache License

/**
 * Dumps the reverse engineered CBOR content from the provided segment
 * directories if a parent directory contains more than one segment,
 * otherwise a single segment can be passed as an argument. If the boolean
 * argument is provided then the CBOR is also zipped.
 *
 * @param outputDir      the directory you wish to dump the raw content to. This
 *                       directory will be created.
 * @param segmentRootDir a directory containing one or more segments.
 * @param linkdb         Path to linkdb.
 * @param gzip           a boolean flag indicating whether the CBOR content should also
 *                       be gzipped.
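 * @param mimeTypes      an optional array of MIME types to restrict the dump to; if {@code null}, documents of all types are written.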
 * @param epochFilename  if {@code true}, output files will be named using the epoch time (in milliseconds).
 * @param extension      a file extension to use with output documents.
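 * @param warc           if {@code true}, output is written as WARC records instead of individual CBOR files.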
 * @throws Exception if any exception occurs.
 */
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes,
        boolean epochFilename, String extension, boolean warc) throws Exception {
    if (gzip) {
        LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counters
    Map<String, Integer> filteredCounts = new HashMap<>();

    Configuration nutchConfig = NutchConfiguration.create();
    Path segmentRootPath = new Path(segmentRootDir.toString());
    FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);

    //get all paths
    List<Path> parts = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}"
            + File.separator + "data";
    while (files.hasNext()) {
        LocatedFileStatus next = files.next();
        if (next.isFile()) {
            Path path = next.getPath();
            if (path.toString().matches(partPattern)) {
                parts.add(path);
            }
        }
    }

    LinkDbReader linkDbReader = null;
    if (linkdb != null) {
        linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
    }
    if (parts == null || parts.size() == 0) {
        LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
        System.exit(1);
    }
    LOG.info("Found {} segment parts", parts.size());
    if (gzip && !warc) {
        fileList = new ArrayList<>();
        constructNewStream(outputDir);
    }

    for (Path segmentPart : parts) {
        LOG.info("Processing segment Part : [ {} ]", segmentPart);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig,
                    SequenceFile.Reader.file(segmentPart));

            Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();

            Content content = null;
            while (reader.next(key)) {
                content = new Content();
                reader.getCurrentValue(content);
                Metadata metadata = content.getMetadata();
                String url = key.toString();

                String baseName = FilenameUtils.getBaseName(url);
                String extensionName = FilenameUtils.getExtension(url);

                if (!extension.isEmpty()) {
                    extensionName = extension;
                } else if ((extensionName == null) || extensionName.isEmpty()) {
                    extensionName = "html";
                }

                String outputFullPath = null;
                String outputRelativePath = null;
                String filename = null;
                String timestamp = null;
                String reverseKey = null;

                if (epochFilename || config.getReverseKey()) {
                    try {
                        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z")
                                .parse(getDate(metadata.get("Date"))).getTime();
                        timestamp = String.valueOf(epoch);
                    } catch (ParseException pe) {
                        LOG.warn(pe.getMessage());
                    }

                    reverseKey = reverseUrl(url);
                    config.setReverseKeyValue(
                            reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                }

                if (!warc) {
                    if (epochFilename) {
                        outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(),
                                reverseKey, url, timestamp, extensionName, !gzip);
                        outputRelativePath = outputFullPath.substring(0,
                                outputFullPath.lastIndexOf(File.separator) - 1);
                        filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                    } else {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(),
                                md5Ofurl, !gzip);
                        filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                        outputFullPath = String.format("%s/%s", fullDir, filename);

                        String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                        String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                        String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                        outputRelativePath = firstLevelDirName + secondLevelDirName;
                    }
                }
                // Encode all filetypes if no mimetypes have been given
                Boolean filter = (mimeTypes == null);

                String jsonData = "";
                try {
                    String mimeType = new Tika().detect(content.getContent());
                    // Maps file to JSON-based structure

                    Set<String> inUrls = null; //there may be duplicates, so using set
                    if (linkDbReader != null) {
                        Inlinks inlinks = linkDbReader.getInlinks((Text) key);
                        if (inlinks != null) {
                            Iterator<Inlink> iterator = inlinks.iterator();
                            inUrls = new LinkedHashSet<>();
                            while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
                                inUrls.add(iterator.next().getFromUrl());
                            }
                        }
                    }
                    //TODO: Make this Jackson Format implementation reusable
                    try (CommonCrawlFormat format = CommonCrawlFormatFactory
                            .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
                        if (inUrls != null) {
                            format.setInLinks(new ArrayList<>(inUrls));
                        }
                        jsonData = format.getJsonData(url, content, metadata);
                    }

                    collectStats(typeCounts, mimeType);
                    // collects statistics for the given mimetypes
                    if ((mimeType != null) && (mimeTypes != null)
                            && Arrays.asList(mimeTypes).contains(mimeType)) {
                        collectStats(filteredCounts, mimeType);
                        filter = true;
                    }
                } catch (IOException ioe) {
                    LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                    return;
                }

                if (!warc) {
                    if (filter) {
                        byte[] byteData = serializeCBORData(jsonData);

                        if (!gzip) {
                            File outputFile = new File(outputFullPath);
                            if (outputFile.exists()) {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            } else {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                IOUtils.copy(new ByteArrayInputStream(byteData),
                                        new FileOutputStream(outputFile));
                            }
                        } else {
                            if (fileList.contains(outputFullPath)) {
                                LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
                            } else {
                                fileList.add(outputFullPath);
                                LOG.info("Compressing: [" + outputFullPath + "]");
                                //TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
                                TarArchiveEntry tarEntry = new TarArchiveEntry(
                                        outputRelativePath + File.separator + filename);
                                tarEntry.setSize(byteData.length);
                                tarOutput.putArchiveEntry(tarEntry);
                                tarOutput.write(byteData);
                                tarOutput.closeArchiveEntry();
                            }
                        }
                    }
                }
            }
            reader.close();
        } catch (Exception e) {
            LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
        } finally {
            fs.close();
        }
    }

    if (gzip && !warc) {
        closeStream();
    }

    if (!typeCounts.isEmpty()) {
        LOG.info("CommonsCrawlDataDumper File Stats: "
                + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }

}

From source file:org.apache.phoenix.compile.ListJarsQueryPlan.java

License:Apache License

@Override
public ResultIterator iterator(ParallelScanGrouper scanGrouper) throws SQLException {
    return new ResultIterator() {
        private RemoteIterator<LocatedFileStatus> listFiles = null;

        @Override
        public void close() throws SQLException {

        }

        @Override
        public Tuple next() throws SQLException {
            try {
                if (first) {
                    String dynamicJarsDir = stmt.getConnection().getQueryServices().getProps()
                            .get(QueryServices.DYNAMIC_JARS_DIR_KEY);
                    if (dynamicJarsDir == null) {
                        throw new SQLException(QueryServices.DYNAMIC_JARS_DIR_KEY
                                + " is not configured for the listing the jars.");
                    }
                    dynamicJarsDir = dynamicJarsDir.endsWith("/") ? dynamicJarsDir : dynamicJarsDir + '/';
                    Configuration conf = HBaseFactoryProvider.getConfigurationFactory().getConfiguration();
                    Path dynamicJarsDirPath = new Path(dynamicJarsDir);
                    FileSystem fs = dynamicJarsDirPath.getFileSystem(conf);
                    listFiles = fs.listFiles(dynamicJarsDirPath, true);
                    first = false;
                }
                if (listFiles == null || !listFiles.hasNext())
                    return null;
                ImmutableBytesWritable ptr = new ImmutableBytesWritable();
                ParseNodeFactory factory = new ParseNodeFactory();
                LiteralParseNode literal = factory.literal(listFiles.next().getPath().toString());
                LiteralExpression expression = LiteralExpression.newConstant(literal.getValue(),
                        PVarchar.INSTANCE, Determinism.ALWAYS);
                expression.evaluate(null, ptr);
                byte[] rowKey = ByteUtil.copyKeyBytesIfNecessary(ptr);
                Cell cell = CellUtil.createCell(rowKey, HConstants.EMPTY_BYTE_ARRAY,
                        HConstants.EMPTY_BYTE_ARRAY, EnvironmentEdgeManager.currentTimeMillis(),
                        Type.Put.getCode(), HConstants.EMPTY_BYTE_ARRAY);
                List<Cell> cells = new ArrayList<Cell>(1);
                cells.add(cell);
                return new ResultTuple(Result.create(cells));
            } catch (IOException e) {
                throw new SQLException(e);
            }
        }

        @Override
        public void explain(List<String> planSteps) {
        }
    };
}

From source file:org.apache.ranger.services.hdfs.HDFSRangerTest.java

License:Apache License

@org.junit.Test
public void executeTest() throws Exception {
    FileSystem fileSystem = hdfsCluster.getFileSystem();

    // Write a file - the AccessControlEnforcer won't be invoked as we are the "superuser"
    final Path file = new Path("/tmp/tmpdir3/data-file2");
    FSDataOutputStream out = fileSystem.create(file);
    for (int i = 0; i < 1024; ++i) {
        out.write(("data" + i + "\n").getBytes("UTF-8"));
        out.flush();
    }
    out.close();

    // Change permissions to read-only
    fileSystem.setPermission(file, new FsPermission(FsAction.READ, FsAction.NONE, FsAction.NONE));

    // Change the parent directory permissions to be execute only for the owner
    Path parentDir = new Path("/tmp/tmpdir3");
    fileSystem.setPermission(parentDir, new FsPermission(FsAction.EXECUTE, FsAction.NONE, FsAction.NONE));

    // Try to read the directory as "bob" - this should be allowed (by the policy - user)
    UserGroupInformation ugi = UserGroupInformation.createUserForTesting("bob", new String[] {});
    ugi.doAs(new PrivilegedExceptionAction<Void>() {

        public Void run() throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", defaultFs);

            FileSystem fs = FileSystem.get(conf);

            RemoteIterator<LocatedFileStatus> iter = fs.listFiles(file.getParent(), false);
            Assert.assertTrue(iter.hasNext());

            fs.close();
            return null;
        }
    });

    // Try to read the directory as "alice" - this should be allowed (by the policy - group)
    ugi = UserGroupInformation.createUserForTesting("alice", new String[] { "IT" });
    ugi.doAs(new PrivilegedExceptionAction<Void>() {

        public Void run() throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", defaultFs);

            FileSystem fs = FileSystem.get(conf);

            RemoteIterator<LocatedFileStatus> iter = fs.listFiles(file.getParent(), false);
            Assert.assertTrue(iter.hasNext());

            fs.close();
            return null;
        }
    });

    // Now try to read the directory as unknown user "eve" - this should not be allowed
    ugi = UserGroupInformation.createUserForTesting("eve", new String[] {});
    ugi.doAs(new PrivilegedExceptionAction<Void>() {

        public Void run() throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", defaultFs);

            FileSystem fs = FileSystem.get(conf);

            // Write to the file
            try {
                RemoteIterator<LocatedFileStatus> iter = fs.listFiles(file.getParent(), false);
                Assert.assertTrue(iter.hasNext());
                Assert.fail("Failure expected on an incorrect permission");
            } catch (RemoteException ex) {
                // expected
                Assert.assertTrue(RangerAccessControlException.class.getName().equals(ex.getClassName()));
            }

            fs.close();
            return null;
        }
    });

}

From source file:org.apache.sentry.tests.e2e.hdfs.TestHDFSIntegration.java

License:Apache License

private void verifyAccessToPath(String user, String group, String path, boolean hasPermission)
        throws Exception {
    Path p = new Path(path);
    UserGroupInformation hadoopUser = UserGroupInformation.createUserForTesting(user, new String[] { group });
    FileSystem fs = DFSTestUtil.getFileSystemAs(hadoopUser, hadoopConf);
    try {
        fs.listFiles(p, true);
        if (!hasPermission) {
            Assert.assertFalse("Expected listing files to fail", false);
        }
    } catch (Exception e) {
        if (hasPermission) {
            throw e;
        }
    }
}

From source file:org.apache.sentry.tests.e2e.hdfs.TestHDFSIntegrationBase.java

License:Apache License

protected void verifyAccessToPath(String user, String group, String path, boolean hasPermission)
        throws Exception {
    Path p = new Path(path);
    FileSystem fs = miniDFS.getFileSystem();
    try {
        fs.listFiles(p, true);
        if (!hasPermission) {
            assertFalse("Expected listing files to fail", false);
        }
    } catch (Exception e) {
        if (hasPermission) {
            throw e;
        }
    }
}

From source file:org.apache.tajo.engine.planner.TestPlannerUtil.java

License:Apache License

@Test
public void testGetNonZeroLengthDataFiles() throws Exception {
    String queryFiles = ClassLoader.getSystemResource("queries").toString() + "/TestSelectQuery";
    Path path = new Path(queryFiles);

    TableDesc tableDesc = new TableDesc();
    tableDesc.setName("Test");
    tableDesc.setPath(path.toUri());

    FileSystem fs = path.getFileSystem(util.getConfiguration());

    List<Path> expectedFiles = new ArrayList<Path>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(path, true);
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        if (file.isFile() && file.getLen() > 0) {
            expectedFiles.add(file.getPath());
        }
    }
    int fileNum = expectedFiles.size() / 5;

    int numResultFiles = 0;
    for (int i = 0; i <= 5; i++) {
        int start = i * fileNum;

        FragmentProto[] fragments = PhysicalPlanUtil.getNonZeroLengthDataFiles(util.getConfiguration(),
                tableDesc, start, fileNum);
        assertNotNull(fragments);

        numResultFiles += fragments.length;
        int expectedSize = fileNum;
        if (i == 5) {
            //last
            expectedSize = expectedFiles.size() - (fileNum * 5);
        }

        comparePath(expectedFiles, fragments, start, expectedSize);
    }

    assertEquals(expectedFiles.size(), numResultFiles);
}

From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.AbstractHadoopGraphComputer.java

License:Apache License

public static File copyDirectoryIfNonExistent(final FileSystem fileSystem, final String directory) {
    try {
        final String hadoopGremlinLibsRemote = "hadoop-gremlin-" + Gremlin.version() + "-libs";
        final Path path = new Path(directory);
        if (Boolean.valueOf(System.getProperty("is.testing", "false"))
                || (fileSystem.exists(path) && fileSystem.isDirectory(path))) {
            final File tempDirectory = new File(
                    System.getProperty("java.io.tmpdir") + File.separator + hadoopGremlinLibsRemote);
            assert tempDirectory.exists() || tempDirectory.mkdirs();
            final String tempPath = tempDirectory.getAbsolutePath() + File.separator + path.getName();
            final RemoteIterator<LocatedFileStatus> files = fileSystem.listFiles(path, false);
            while (files.hasNext()) {
                final LocatedFileStatus f = files.next();
                fileSystem.copyToLocalFile(false, f.getPath(),
                        new Path(tempPath + System.getProperty("file.separator") + f.getPath().getName()),
                        true);
            }
            return new File(tempPath);
        } else
            return new File(directory);
    } catch (final IOException e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}

From source file:org.apache.vxquery.metadata.VXQueryCollectionOperatorDescriptor.java

License:Apache License

@Override
public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions)
        throws HyracksDataException {
    final FrameTupleAccessor fta = new FrameTupleAccessor(ctx.getFrameSize(),
            recordDescProvider.getInputRecordDescriptor(getActivityId(), 0));
    final int fieldOutputCount = recordDescProvider.getOutputRecordDescriptor(getActivityId(), 0)
            .getFieldCount();
    final ByteBuffer frame = ctx.allocateFrame();
    final FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize(), fieldOutputCount);
    final short partitionId = (short) ctx.getTaskAttemptId().getTaskId().getPartition();
    final ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider(partitionId, dataSourceId,
            totalDataSources);
    final String nodeId = ctx.getJobletContext().getApplicationContext().getNodeId();
    final DynamicContext dCtx = (DynamicContext) ctx.getJobletContext().getGlobalJobData();

    final String collectionName = collectionPartitions[partition % collectionPartitions.length];
    final XMLParser parser = new XMLParser(false, nodeIdProvider, nodeId, frame, appender, childSeq,
            dCtx.getStaticContext());

    return new AbstractUnaryInputUnaryOutputOperatorNodePushable() {
        @Override
        public void open() throws HyracksDataException {
            appender.reset(frame, true);
            writer.open();
            hdfs = new HDFSFunctions();
        }

        @Override
        public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
            fta.reset(buffer);
            String collectionModifiedName = collectionName.replace("${nodeId}", nodeId);
            if (!collectionModifiedName.contains("hdfs:/")) {
                File collectionDirectory = new File(collectionModifiedName);
                //check if directory is in the local file system
                if (collectionDirectory.exists()) {
                    // Go through each tuple.
                    if (collectionDirectory.isDirectory()) {
                        for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                            Iterator<File> it = FileUtils.iterateFiles(collectionDirectory,
                                    new VXQueryIOFileFilter(), TrueFileFilter.INSTANCE);
                            while (it.hasNext()) {
                                File xmlDocument = it.next();
                                if (LOGGER.isLoggable(Level.FINE)) {
                                    LOGGER.fine(
                                            "Starting to read XML document: " + xmlDocument.getAbsolutePath());
                                }
                                parser.parseElements(xmlDocument, writer, fta, tupleIndex);
                            }
                        }
                    } else {
                        throw new HyracksDataException("Invalid directory parameter (" + nodeId + ":"
                                + collectionDirectory.getAbsolutePath() + ") passed to collection.");
                    }
                }
            } else {
                // Else check in HDFS file system
                // Get instance of the HDFS filesystem
                FileSystem fs = hdfs.getFileSystem();
                if (fs != null) {
                    collectionModifiedName = collectionModifiedName.replaceAll("hdfs:/", "");
                    Path directory = new Path(collectionModifiedName);
                    Path xmlDocument;
                    if (tag != null) {
                        hdfs.setJob(directory.getName(), tag);
                        tag = "<" + tag + ">";
                        Job job = hdfs.getJob();
                        InputFormat inputFormat = hdfs.getinputFormat();
                        try {
                            hdfs.scheduleSplits();
                            ArrayList<Integer> schedule = hdfs
                                    .getScheduleForNode(InetAddress.getLocalHost().getHostName());
                            List<InputSplit> splits = hdfs.getSplits();
                            List<FileSplit> fileSplits = new ArrayList<FileSplit>();
                            for (int i : schedule) {
                                fileSplits.add((FileSplit) splits.get(i));
                            }
                            FileSplitsFactory splitsFactory = new FileSplitsFactory(fileSplits);
                            List<FileSplit> inputSplits = splitsFactory.getSplits();
                            ContextFactory ctxFactory = new ContextFactory();
                            int size = inputSplits.size();
                            InputStream stream;
                            String value;
                            RecordReader reader;
                            TaskAttemptContext context;
                            for (int i = 0; i < size; i++) {
                                //read split
                                context = ctxFactory.createContext(job.getConfiguration(), i);
                                try {
                                    reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                                    reader.initialize(inputSplits.get(i), context);
                                    while (reader.nextKeyValue()) {
                                        value = reader.getCurrentValue().toString();
                                        //Split value if it contains more than one item with the tag
                                        if (StringUtils.countMatches(value, tag) > 1) {
                                            String items[] = value.split(tag);
                                            for (String item : items) {
                                                if (item.length() > 0) {
                                                    item = START_TAG + tag + item;
                                                    stream = new ByteArrayInputStream(
                                                            item.getBytes(StandardCharsets.UTF_8));
                                                    parser.parseHDFSElements(stream, writer, fta, i);
                                                }
                                            }
                                        } else {
                                            value = START_TAG + value;
                                            //create an input stream to the file currently reading and send it to parser
                                            stream = new ByteArrayInputStream(
                                                    value.getBytes(StandardCharsets.UTF_8));
                                            parser.parseHDFSElements(stream, writer, fta, i);
                                        }
                                    }

                                } catch (InterruptedException e) {
                                    if (LOGGER.isLoggable(Level.SEVERE)) {
                                        LOGGER.severe(e.getMessage());
                                    }
                                }
                            }

                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (ParserConfigurationException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (SAXException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    } else {
                        try {
                            //check if the path exists and is a directory
                            if (fs.exists(directory) && fs.isDirectory(directory)) {
                                for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                                    //read every file in the directory
                                    RemoteIterator<LocatedFileStatus> it = fs.listFiles(directory, true);
                                    while (it.hasNext()) {
                                        xmlDocument = it.next().getPath();
                                        if (fs.isFile(xmlDocument)) {
                                            if (LOGGER.isLoggable(Level.FINE)) {
                                                LOGGER.fine("Starting to read XML document: "
                                                        + xmlDocument.getName());
                                            }
                                            //create an input stream to the file currently reading and send it to parser
                                            InputStream in = fs.open(xmlDocument).getWrappedStream();
                                            parser.parseHDFSElements(in, writer, fta, tupleIndex);
                                        }
                                    }
                                }
                            } else {
                                throw new HyracksDataException("Invalid HDFS directory parameter (" + nodeId
                                        + ":" + directory + ") passed to collection.");
                            }
                        } catch (FileNotFoundException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    }
                    try {
                        fs.close();
                    } catch (IOException e) {
                        if (LOGGER.isLoggable(Level.SEVERE)) {
                            LOGGER.severe(e.getMessage());
                        }
                    }
                }
            }
        }

        @Override
        public void fail() throws HyracksDataException {
            writer.fail();
        }

        @Override
        public void close() throws HyracksDataException {
            // Check if needed?
            fta.reset(frame);
            if (fta.getTupleCount() > 0) {
                FrameUtils.flushFrame(frame, writer);
            }
            writer.close();
        }
    };
}

From source file:org.datacleaner.spark.ApplicationDriver.java

License:Open Source License

private List<String> buildJarFiles(MutableRef<String> primaryJarRef) throws IOException {
    final List<String> list = new ArrayList<>();

    final Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://" + _hostname + ":" + _port);

    final FileSystem fs = FileSystem.newInstance(conf);
    try {
        final Path directoryPath = new Path(_jarDirectoryPath);
        final RemoteIterator<LocatedFileStatus> files = fs.listFiles(directoryPath, false);
        while (files.hasNext()) {
            final LocatedFileStatus file = files.next();
            final Path path = file.getPath();
            final String filename = path.getName();
            if (filename.startsWith(PRIMARY_JAR_FILENAME_PREFIX)) {
                primaryJarRef.set(path.toString());
            } else {
                list.add(path.toString());
            }
        }
    } finally {
        FileHelper.safeClose(fs);
    }

    if (primaryJarRef.get() == null) {
        throw new IllegalArgumentException("Failed to find primary jar (starting with '"
                + PRIMARY_JAR_FILENAME_PREFIX + "') in JAR file directory: " + _jarDirectoryPath);
    }

    return list;
}

From source file:org.deeplearning4j.datasets.DatasetReaderFromHdfs.java

License:Apache License

protected void doInitialize() {
    FileSystem fs = CommonUtils.openHdfsConnect();
    try {
        if (train) {
            hdfsIter = fs.listFiles(new Path(CommonUtils.TRAIN_HDFS_PATH), true);
        } else {
            hdfsIter = fs.listFiles(new Path(CommonUtils.VALIDATE_HDFS_PATH), true);
        }
        while (hdfsIter.hasNext()) {
            LocatedFileStatus next = hdfsIter.next();
            Path path = next.getPath();
            String currentPath = path.toUri().getPath();
            fileNames.add(path.toString());
            String name = FilenameUtils.getBaseName((new File(currentPath)).getParent());
            if (!labels.contains(name)) {
                labels.add(name);
            }

        }
        Collections.shuffle(fileNames);
        fileIterator = fileNames.iterator();
        numExample = fileNames.size();
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        CommonUtils.closeHdfsConnect(fs);
    }
}