Example usage for org.apache.hadoop.fs FileSystem listFiles

List of usage examples for org.apache.hadoop.fs FileSystem listFiles

Introduction

On this page you can find example usage of org.apache.hadoop.fs FileSystem listFiles.

Prototype

public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive)
        throws FileNotFoundException, IOException 

Source Link

Document

List the statuses and block locations of the files in the given path.
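
A minimal, self-contained sketch of the call pattern follows, for orientation before the full project examples in the Usage section. The path, configuration, and class name are illustrative assumptions, not taken from any of the quoted sources.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListFilesSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/example");               // hypothetical input directory
        FileSystem fs = dir.getFileSystem(conf);
        try {
            // Recursively list every file under dir; directories themselves are not returned.
            RemoteIterator<LocatedFileStatus> it = fs.listFiles(dir, true);
            while (it.hasNext()) {
                LocatedFileStatus status = it.next();
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
                // Each LocatedFileStatus also carries the block locations of the file.
                for (BlockLocation block : status.getBlockLocations()) {
                    System.out.println("  block at offset " + block.getOffset() + " on "
                            + String.join(",", block.getHosts()));
                }
            }
        } finally {
            fs.close();
        }
    }
}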

Usage

From source file:org.apache.nutch.tools.CommonCrawlDataDumper.java

License:Apache License

/**
 * Dumps the reverse engineered CBOR content from the provided segment
 * directories if a parent directory contains more than one segment,
 * otherwise a single segment can be passed as an argument. If the boolean
 * argument is provided then the CBOR is also zipped.
 *
 * @param outputDir      the directory you wish to dump the raw content to. This
 *                       directory will be created.
 * @param segmentRootDir a directory containing one or more segments.
 * @param linkdb         Path to linkdb.
 * @param gzip           a boolean flag indicating whether the CBOR content should also
 *                       be gzipped.
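 * @param mimeTypes      an optional array of MIME types to restrict the dump to; if {@code null}, documents of all types are written.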
 * @param epochFilename  if {@code true}, output files will be named using the epoch time (in milliseconds).
 * @param extension      a file extension to use with output documents.
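 * @param warc           if {@code true}, output is written as WARC records instead of individual CBOR files.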
 * @throws Exception if any exception occurs.
 */
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip, String[] mimeTypes,
        boolean epochFilename, String extension, boolean warc) throws Exception {
    if (gzip) {
        LOG.info("Gzipping CBOR data has been skipped");
    }
    // total file counts
    Map<String, Integer> typeCounts = new HashMap<>();
    // filtered file counters
    Map<String, Integer> filteredCounts = new HashMap<>();

    Configuration nutchConfig = NutchConfiguration.create();
    Path segmentRootPath = new Path(segmentRootDir.toString());
    FileSystem fs = segmentRootPath.getFileSystem(nutchConfig);

    //get all paths
    List<Path> parts = new ArrayList<>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(segmentRootPath, true);
    String partPattern = ".*" + File.separator + Content.DIR_NAME + File.separator + "part-[0-9]{5}"
            + File.separator + "data";
    while (files.hasNext()) {
        LocatedFileStatus next = files.next();
        if (next.isFile()) {
            Path path = next.getPath();
            if (path.toString().matches(partPattern)) {
                parts.add(path);
            }
        }
    }

    LinkDbReader linkDbReader = null;
    if (linkdb != null) {
        linkDbReader = new LinkDbReader(nutchConfig, new Path(linkdb.toString()));
    }
    if (parts == null || parts.size() == 0) {
        LOG.error("No segment directories found in {} ", segmentRootDir.getAbsolutePath());
        System.exit(1);
    }
    LOG.info("Found {} segment parts", parts.size());
    if (gzip && !warc) {
        fileList = new ArrayList<>();
        constructNewStream(outputDir);
    }

    for (Path segmentPart : parts) {
        LOG.info("Processing segment Part : [ {} ]", segmentPart);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig,
                    SequenceFile.Reader.file(segmentPart));

            Writable key = (Writable) reader.getKeyClass().getConstructor().newInstance();

            Content content = null;
            while (reader.next(key)) {
                content = new Content();
                reader.getCurrentValue(content);
                Metadata metadata = content.getMetadata();
                String url = key.toString();

                String baseName = FilenameUtils.getBaseName(url);
                String extensionName = FilenameUtils.getExtension(url);

                if (!extension.isEmpty()) {
                    extensionName = extension;
                } else if ((extensionName == null) || extensionName.isEmpty()) {
                    extensionName = "html";
                }

                String outputFullPath = null;
                String outputRelativePath = null;
                String filename = null;
                String timestamp = null;
                String reverseKey = null;

                if (epochFilename || config.getReverseKey()) {
                    try {
                        long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z")
                                .parse(getDate(metadata.get("Date"))).getTime();
                        timestamp = String.valueOf(epoch);
                    } catch (ParseException pe) {
                        LOG.warn(pe.getMessage());
                    }

                    reverseKey = reverseUrl(url);
                    config.setReverseKeyValue(
                            reverseKey.replace("/", "_") + "_" + DigestUtils.sha1Hex(url) + "_" + timestamp);
                }

                if (!warc) {
                    if (epochFilename) {
                        outputFullPath = DumpFileUtil.createFileNameFromUrl(outputDir.getAbsolutePath(),
                                reverseKey, url, timestamp, extensionName, !gzip);
                        outputRelativePath = outputFullPath.substring(0,
                                outputFullPath.lastIndexOf(File.separator) - 1);
                        filename = content.getMetadata().get(Metadata.DATE) + "." + extensionName;
                    } else {
                        String md5Ofurl = DumpFileUtil.getUrlMD5(url);
                        String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(),
                                md5Ofurl, !gzip);
                        filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extensionName);
                        outputFullPath = String.format("%s/%s", fullDir, filename);

                        String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
                        String firstLevelDirName = fullPathLevels[fullPathLevels.length - 2];
                        String secondLevelDirName = fullPathLevels[fullPathLevels.length - 1];
                        outputRelativePath = firstLevelDirName + secondLevelDirName;
                    }
                }
                // Encode all filetypes if no mimetypes have been given
                Boolean filter = (mimeTypes == null);

                String jsonData = "";
                try {
                    String mimeType = new Tika().detect(content.getContent());
                    // Maps file to JSON-based structure

                    Set<String> inUrls = null; //there may be duplicates, so using set
                    if (linkDbReader != null) {
                        Inlinks inlinks = linkDbReader.getInlinks((Text) key);
                        if (inlinks != null) {
                            Iterator<Inlink> iterator = inlinks.iterator();
                            inUrls = new LinkedHashSet<>();
                            while (inUrls.size() <= MAX_INLINKS && iterator.hasNext()) {
                                inUrls.add(iterator.next().getFromUrl());
                            }
                        }
                    }
                    //TODO: Make this Jackson Format implementation reusable
                    try (CommonCrawlFormat format = CommonCrawlFormatFactory
                            .getCommonCrawlFormat(warc ? "WARC" : "JACKSON", nutchConfig, config)) {
                        if (inUrls != null) {
                            format.setInLinks(new ArrayList<>(inUrls));
                        }
                        jsonData = format.getJsonData(url, content, metadata);
                    }

                    collectStats(typeCounts, mimeType);
                    // collects statistics for the given mimetypes
                    if ((mimeType != null) && (mimeTypes != null)
                            && Arrays.asList(mimeTypes).contains(mimeType)) {
                        collectStats(filteredCounts, mimeType);
                        filter = true;
                    }
                } catch (IOException ioe) {
                    LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
                    return;
                }

                if (!warc) {
                    if (filter) {
                        byte[] byteData = serializeCBORData(jsonData);

                        if (!gzip) {
                            File outputFile = new File(outputFullPath);
                            if (outputFile.exists()) {
                                LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
                            } else {
                                LOG.info("Writing: [" + outputFullPath + "]");
                                IOUtils.copy(new ByteArrayInputStream(byteData),
                                        new FileOutputStream(outputFile));
                            }
                        } else {
                            if (fileList.contains(outputFullPath)) {
                                LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
                            } else {
                                fileList.add(outputFullPath);
                                LOG.info("Compressing: [" + outputFullPath + "]");
                                //TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
                                TarArchiveEntry tarEntry = new TarArchiveEntry(
                                        outputRelativePath + File.separator + filename);
                                tarEntry.setSize(byteData.length);
                                tarOutput.putArchiveEntry(tarEntry);
                                tarOutput.write(byteData);
                                tarOutput.closeArchiveEntry();
                            }
                        }
                    }
                }
            }
            reader.close();
        } catch (Exception e) {
            LOG.warn("SKIPPED: {} Because : {}", segmentPart, e.getMessage());
        } finally {
            fs.close();
        }
    }

    if (gzip && !warc) {
        closeStream();
    }

    if (!typeCounts.isEmpty()) {
        LOG.info("CommonsCrawlDataDumper File Stats: "
                + DumpFileUtil.displayFileTypes(typeCounts, filteredCounts));
    }

}

From source file:org.apache.phoenix.compile.ListJarsQueryPlan.java

License:Apache License

@Override
public ResultIterator iterator(ParallelScanGrouper scanGrouper) throws SQLException {
    return new ResultIterator() {
        private RemoteIterator<LocatedFileStatus> listFiles = null;

        @Override
        public void close() throws SQLException {

        }

        @Override
        public Tuple next() throws SQLException {
            try {
                if (first) {
                    String dynamicJarsDir = stmt.getConnection().getQueryServices().getProps()
                            .get(QueryServices.DYNAMIC_JARS_DIR_KEY);
                    if (dynamicJarsDir == null) {
                        throw new SQLException(QueryServices.DYNAMIC_JARS_DIR_KEY
                                + " is not configured for the listing the jars.");
                    }
                    dynamicJarsDir = dynamicJarsDir.endsWith("/") ? dynamicJarsDir : dynamicJarsDir + '/';
                    Configuration conf = HBaseFactoryProvider.getConfigurationFactory().getConfiguration();
                    Path dynamicJarsDirPath = new Path(dynamicJarsDir);
                    FileSystem fs = dynamicJarsDirPath.getFileSystem(conf);
                    listFiles = fs.listFiles(dynamicJarsDirPath, true);
                    first = false;
                }
                if (listFiles == null || !listFiles.hasNext())
                    return null;
                ImmutableBytesWritable ptr = new ImmutableBytesWritable();
                ParseNodeFactory factory = new ParseNodeFactory();
                LiteralParseNode literal = factory.literal(listFiles.next().getPath().toString());
                LiteralExpression expression = LiteralExpression.newConstant(literal.getValue(),
                        PVarchar.INSTANCE, Determinism.ALWAYS);
                expression.evaluate(null, ptr);
                byte[] rowKey = ByteUtil.copyKeyBytesIfNecessary(ptr);
                Cell cell = CellUtil.createCell(rowKey, HConstants.EMPTY_BYTE_ARRAY,
                        HConstants.EMPTY_BYTE_ARRAY, EnvironmentEdgeManager.currentTimeMillis(),
                        Type.Put.getCode(), HConstants.EMPTY_BYTE_ARRAY);
                List<Cell> cells = new ArrayList<Cell>(1);
                cells.add(cell);
                return new ResultTuple(Result.create(cells));
            } catch (IOException e) {
                throw new SQLException(e);
            }
        }

        @Override
        public void explain(List<String> planSteps) {
        }
    };
}

From source file:org.apache.ranger.services.hdfs.HDFSRangerTest.java

License:Apache License

@org.junit.Test
public void executeTest() throws Exception {
    FileSystem fileSystem = hdfsCluster.getFileSystem();

    // Write a file - the AccessControlEnforcer won't be invoked as we are the "superuser"
    final Path file = new Path("/tmp/tmpdir3/data-file2");
    FSDataOutputStream out = fileSystem.create(file);
    for (int i = 0; i < 1024; ++i) {
        out.write(("data" + i + "\n").getBytes("UTF-8"));
        out.flush();
    }
    out.close();

    // Change permissions to read-only
    fileSystem.setPermission(file, new FsPermission(FsAction.READ, FsAction.NONE, FsAction.NONE));

    // Change the parent directory permissions to be execute only for the owner
    Path parentDir = new Path("/tmp/tmpdir3");
    fileSystem.setPermission(parentDir, new FsPermission(FsAction.EXECUTE, FsAction.NONE, FsAction.NONE));

    // Try to read the directory as "bob" - this should be allowed (by the policy - user)
    UserGroupInformation ugi = UserGroupInformation.createUserForTesting("bob", new String[] {});
    ugi.doAs(new PrivilegedExceptionAction<Void>() {

        public Void run() throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", defaultFs);

            FileSystem fs = FileSystem.get(conf);

            RemoteIterator<LocatedFileStatus> iter = fs.listFiles(file.getParent(), false);
            Assert.assertTrue(iter.hasNext());

            fs.close();
            return null;
        }
    });

    // Try to read the directory as "alice" - this should be allowed (by the policy - group)
    ugi = UserGroupInformation.createUserForTesting("alice", new String[] { "IT" });
    ugi.doAs(new PrivilegedExceptionAction<Void>() {

        public Void run() throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", defaultFs);

            FileSystem fs = FileSystem.get(conf);

            RemoteIterator<LocatedFileStatus> iter = fs.listFiles(file.getParent(), false);
            Assert.assertTrue(iter.hasNext());

            fs.close();
            return null;
        }
    });

    // Now try to read the directory as unknown user "eve" - this should not be allowed
    ugi = UserGroupInformation.createUserForTesting("eve", new String[] {});
    ugi.doAs(new PrivilegedExceptionAction<Void>() {

        public Void run() throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", defaultFs);

            FileSystem fs = FileSystem.get(conf);

            // Write to the file
            try {
                RemoteIterator<LocatedFileStatus> iter = fs.listFiles(file.getParent(), false);
                Assert.assertTrue(iter.hasNext());
                Assert.fail("Failure expected on an incorrect permission");
            } catch (RemoteException ex) {
                // expected
                Assert.assertTrue(RangerAccessControlException.class.getName().equals(ex.getClassName()));
            }

            fs.close();
            return null;
        }
    });

}

From source file:org.apache.sentry.tests.e2e.hdfs.TestHDFSIntegration.java

License:Apache License

private void verifyAccessToPath(String user, String group, String path, boolean hasPermission)
        throws Exception {
    Path p = new Path(path);
    UserGroupInformation hadoopUser = UserGroupInformation.createUserForTesting(user, new String[] { group });
    FileSystem fs = DFSTestUtil.getFileSystemAs(hadoopUser, hadoopConf);
    try {
        fs.listFiles(p, true);
        if (!hasPermission) {
            Assert.assertFalse("Expected listing files to fail", false);
        }
    } catch (Exception e) {
        if (hasPermission) {
            throw e;
        }
    }
}

From source file:org.apache.sentry.tests.e2e.hdfs.TestHDFSIntegrationBase.java

License:Apache License

protected void verifyAccessToPath(String user, String group, String path, boolean hasPermission)
        throws Exception {
    Path p = new Path(path);
    FileSystem fs = miniDFS.getFileSystem();
    try {
        fs.listFiles(p, true);
        if (!hasPermission) {
            assertFalse("Expected listing files to fail", false);
        }
    } catch (Exception e) {
        if (hasPermission) {
            throw e;
        }
    }
}

From source file:org.apache.tajo.engine.planner.TestPlannerUtil.java

License:Apache License

@Test
public void testGetNonZeroLengthDataFiles() throws Exception {
    String queryFiles = ClassLoader.getSystemResource("queries").toString() + "/TestSelectQuery";
    Path path = new Path(queryFiles);

    TableDesc tableDesc = new TableDesc();
    tableDesc.setName("Test");
    tableDesc.setPath(path.toUri());

    FileSystem fs = path.getFileSystem(util.getConfiguration());

    List<Path> expectedFiles = new ArrayList<Path>();
    RemoteIterator<LocatedFileStatus> files = fs.listFiles(path, true);
    while (files.hasNext()) {
        LocatedFileStatus file = files.next();
        if (file.isFile() && file.getLen() > 0) {
            expectedFiles.add(file.getPath());
        }
    }
    int fileNum = expectedFiles.size() / 5;

    int numResultFiles = 0;
    for (int i = 0; i <= 5; i++) {
        int start = i * fileNum;

        FragmentProto[] fragments = PhysicalPlanUtil.getNonZeroLengthDataFiles(util.getConfiguration(),
                tableDesc, start, fileNum);
        assertNotNull(fragments);

        numResultFiles += fragments.length;
        int expectedSize = fileNum;
        if (i == 5) {
            //last
            expectedSize = expectedFiles.size() - (fileNum * 5);
        }

        comparePath(expectedFiles, fragments, start, expectedSize);
    }

    assertEquals(expectedFiles.size(), numResultFiles);
}

From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.AbstractHadoopGraphComputer.java

License:Apache License

public static File copyDirectoryIfNonExistent(final FileSystem fileSystem, final String directory) {
    try {
        final String hadoopGremlinLibsRemote = "hadoop-gremlin-" + Gremlin.version() + "-libs";
        final Path path = new Path(directory);
        if (Boolean.valueOf(System.getProperty("is.testing", "false"))
                || (fileSystem.exists(path) && fileSystem.isDirectory(path))) {
            final File tempDirectory = new File(
                    System.getProperty("java.io.tmpdir") + File.separator + hadoopGremlinLibsRemote);
            assert tempDirectory.exists() || tempDirectory.mkdirs();
            final String tempPath = tempDirectory.getAbsolutePath() + File.separator + path.getName();
            final RemoteIterator<LocatedFileStatus> files = fileSystem.listFiles(path, false);
            while (files.hasNext()) {
                final LocatedFileStatus f = files.next();
                fileSystem.copyToLocalFile(false, f.getPath(),
                        new Path(tempPath + System.getProperty("file.separator") + f.getPath().getName()),
                        true);
            }
            return new File(tempPath);
        } else
            return new File(directory);
    } catch (final IOException e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}

From source file:org.apache.vxquery.metadata.VXQueryCollectionOperatorDescriptor.java

License:Apache License

@Override
public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions)
        throws HyracksDataException {
    final FrameTupleAccessor fta = new FrameTupleAccessor(ctx.getFrameSize(),
            recordDescProvider.getInputRecordDescriptor(getActivityId(), 0));
    final int fieldOutputCount = recordDescProvider.getOutputRecordDescriptor(getActivityId(), 0)
            .getFieldCount();
    final ByteBuffer frame = ctx.allocateFrame();
    final FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize(), fieldOutputCount);
    final short partitionId = (short) ctx.getTaskAttemptId().getTaskId().getPartition();
    final ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider(partitionId, dataSourceId,
            totalDataSources);
    final String nodeId = ctx.getJobletContext().getApplicationContext().getNodeId();
    final DynamicContext dCtx = (DynamicContext) ctx.getJobletContext().getGlobalJobData();

    final String collectionName = collectionPartitions[partition % collectionPartitions.length];
    final XMLParser parser = new XMLParser(false, nodeIdProvider, nodeId, frame, appender, childSeq,
            dCtx.getStaticContext());

    return new AbstractUnaryInputUnaryOutputOperatorNodePushable() {
        @Override
        public void open() throws HyracksDataException {
            appender.reset(frame, true);
            writer.open();
            hdfs = new HDFSFunctions();
        }

        @Override
        public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
            fta.reset(buffer);
            String collectionModifiedName = collectionName.replace("${nodeId}", nodeId);
            if (!collectionModifiedName.contains("hdfs:/")) {
                File collectionDirectory = new File(collectionModifiedName);
                //check if directory is in the local file system
                if (collectionDirectory.exists()) {
                    // Go through each tuple.
                    if (collectionDirectory.isDirectory()) {
                        for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                            Iterator<File> it = FileUtils.iterateFiles(collectionDirectory,
                                    new VXQueryIOFileFilter(), TrueFileFilter.INSTANCE);
                            while (it.hasNext()) {
                                File xmlDocument = it.next();
                                if (LOGGER.isLoggable(Level.FINE)) {
                                    LOGGER.fine(
                                            "Starting to read XML document: " + xmlDocument.getAbsolutePath());
                                }
                                parser.parseElements(xmlDocument, writer, fta, tupleIndex);
                            }
                        }
                    } else {
                        throw new HyracksDataException("Invalid directory parameter (" + nodeId + ":"
                                + collectionDirectory.getAbsolutePath() + ") passed to collection.");
                    }
                }
            } else {
                // Else check in HDFS file system
                // Get instance of the HDFS filesystem
                FileSystem fs = hdfs.getFileSystem();
                if (fs != null) {
                    collectionModifiedName = collectionModifiedName.replaceAll("hdfs:/", "");
                    Path directory = new Path(collectionModifiedName);
                    Path xmlDocument;
                    if (tag != null) {
                        hdfs.setJob(directory.getName(), tag);
                        tag = "<" + tag + ">";
                        Job job = hdfs.getJob();
                        InputFormat inputFormat = hdfs.getinputFormat();
                        try {
                            hdfs.scheduleSplits();
                            ArrayList<Integer> schedule = hdfs
                                    .getScheduleForNode(InetAddress.getLocalHost().getHostName());
                            List<InputSplit> splits = hdfs.getSplits();
                            List<FileSplit> fileSplits = new ArrayList<FileSplit>();
                            for (int i : schedule) {
                                fileSplits.add((FileSplit) splits.get(i));
                            }
                            FileSplitsFactory splitsFactory = new FileSplitsFactory(fileSplits);
                            List<FileSplit> inputSplits = splitsFactory.getSplits();
                            ContextFactory ctxFactory = new ContextFactory();
                            int size = inputSplits.size();
                            InputStream stream;
                            String value;
                            RecordReader reader;
                            TaskAttemptContext context;
                            for (int i = 0; i < size; i++) {
                                //read split
                                context = ctxFactory.createContext(job.getConfiguration(), i);
                                try {
                                    reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                                    reader.initialize(inputSplits.get(i), context);
                                    while (reader.nextKeyValue()) {
                                        value = reader.getCurrentValue().toString();
                                        //Split value if it contains more than one item with the tag
                                        if (StringUtils.countMatches(value, tag) > 1) {
                                            String items[] = value.split(tag);
                                            for (String item : items) {
                                                if (item.length() > 0) {
                                                    item = START_TAG + tag + item;
                                                    stream = new ByteArrayInputStream(
                                                            item.getBytes(StandardCharsets.UTF_8));
                                                    parser.parseHDFSElements(stream, writer, fta, i);
                                                }
                                            }
                                        } else {
                                            value = START_TAG + value;
                                            //create an input stream to the file currently reading and send it to parser
                                            stream = new ByteArrayInputStream(
                                                    value.getBytes(StandardCharsets.UTF_8));
                                            parser.parseHDFSElements(stream, writer, fta, i);
                                        }
                                    }

                                } catch (InterruptedException e) {
                                    if (LOGGER.isLoggable(Level.SEVERE)) {
                                        LOGGER.severe(e.getMessage());
                                    }
                                }
                            }

                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (ParserConfigurationException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (SAXException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    } else {
                        try {
                            //check if the path exists and is a directory
                            if (fs.exists(directory) && fs.isDirectory(directory)) {
                                for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                                    //read every file in the directory
                                    RemoteIterator<LocatedFileStatus> it = fs.listFiles(directory, true);
                                    while (it.hasNext()) {
                                        xmlDocument = it.next().getPath();
                                        if (fs.isFile(xmlDocument)) {
                                            if (LOGGER.isLoggable(Level.FINE)) {
                                                LOGGER.fine("Starting to read XML document: "
                                                        + xmlDocument.getName());
                                            }
                                            //create an input stream to the file currently reading and send it to parser
                                            InputStream in = fs.open(xmlDocument).getWrappedStream();
                                            parser.parseHDFSElements(in, writer, fta, tupleIndex);
                                        }
                                    }
                                }
                            } else {
                                throw new HyracksDataException("Invalid HDFS directory parameter (" + nodeId
                                        + ":" + directory + ") passed to collection.");
                            }
                        } catch (FileNotFoundException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    }
                    try {
                        fs.close();
                    } catch (IOException e) {
                        if (LOGGER.isLoggable(Level.SEVERE)) {
                            LOGGER.severe(e.getMessage());
                        }
                    }
                }
            }
        }

        @Override
        public void fail() throws HyracksDataException {
            writer.fail();
        }

        @Override
        public void close() throws HyracksDataException {
            // Check if needed?
            fta.reset(frame);
            if (fta.getTupleCount() > 0) {
                FrameUtils.flushFrame(frame, writer);
            }
            writer.close();
        }
    };
}

From source file:org.datacleaner.spark.ApplicationDriver.java

License:Open Source License

private List<String> buildJarFiles(MutableRef<String> primaryJarRef) throws IOException {
    final List<String> list = new ArrayList<>();

    final Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://" + _hostname + ":" + _port);

    final FileSystem fs = FileSystem.newInstance(conf);
    try {
        final Path directoryPath = new Path(_jarDirectoryPath);
        final RemoteIterator<LocatedFileStatus> files = fs.listFiles(directoryPath, false);
        while (files.hasNext()) {
            final LocatedFileStatus file = files.next();
            final Path path = file.getPath();
            final String filename = path.getName();
            if (filename.startsWith(PRIMARY_JAR_FILENAME_PREFIX)) {
                primaryJarRef.set(path.toString());
            } else {
                list.add(path.toString());
            }
        }
    } finally {
        FileHelper.safeClose(fs);
    }

    if (primaryJarRef.get() == null) {
        throw new IllegalArgumentException("Failed to find primary jar (starting with '"
                + PRIMARY_JAR_FILENAME_PREFIX + "') in JAR file directory: " + _jarDirectoryPath);
    }

    return list;
}

From source file:org.deeplearning4j.datasets.DatasetReaderFromHdfs.java

License:Apache License

protected void doInitialize() {
    FileSystem fs = CommonUtils.openHdfsConnect();
    try {
        if (train) {
            hdfsIter = fs.listFiles(new Path(CommonUtils.TRAIN_HDFS_PATH), true);
        } else {
            hdfsIter = fs.listFiles(new Path(CommonUtils.VALIDATE_HDFS_PATH), true);
        }
        while (hdfsIter.hasNext()) {
            LocatedFileStatus next = hdfsIter.next();
            Path path = next.getPath();
            String currentPath = path.toUri().getPath();
            fileNames.add(path.toString());
            String name = FilenameUtils.getBaseName((new File(currentPath)).getParent());
            if (!labels.contains(name)) {
                labels.add(name);
            }

        }
        Collections.shuffle(fileNames);
        fileIterator = fileNames.iterator();
        numExample = fileNames.size();
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        CommonUtils.closeHdfsConnect(fs);
    }
}