Example usage for org.apache.hadoop.fs FileStatus getPath

List of usage examples for org.apache.hadoop.fs FileStatus getPath

Introduction

In this page you can find the example usage for org.apache.hadoop.fs FileStatus getPath.

Prototype

public Path getPath() 

Source Link

Usage

From source file:co.cask.cdap.data.tools.ReplicationStatusTool.java

License:Apache License

private static void addAllDirFiles(Path filePath, FileSystem fs, List<String> fileList) throws IOException {
    FileStatus[] fileStatus = fs.listStatus(filePath);
    for (FileStatus fileStat : fileStatus) {
        if (fileStat.isDirectory()) {
            addAllDirFiles(fileStat.getPath(), fs, fileList);
        } else {//from   w  ww.  j a v  a  2s  .  com
            fileList.add(fileStat.getPath().toString());
        }
    }
}

From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java

License:Apache License

@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);

    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    PartitionedFileSet outputDataset = taskContext.getDataset(outputDatasetName);
    Partitioning partitioning = outputDataset.getPartitioning();

    Set<PartitionKey> partitionsToAdd = new HashSet<>();
    Set<String> relativePaths = new HashSet<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
        RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedTaskPath.getPath(), path);

            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least one partition key and
                // the output file name
                LOG.warn("Skipping path '{}'. It's relative path '{}' has fewer than two parts", path,
                        relativePath);//  w w w.j a v a  2 s .  c  o m
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir = "../key1/key2"
            // fileName = "part-m-00000"
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            String fileName = relativePath.substring(lastPathSepIdx + 1);

            Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
            Path finalPath = new Path(finalDir, fileName);
            if (fs.exists(finalPath)) {
                throw new FileAlreadyExistsException("Final output path " + finalPath + " already exists");
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            partitionsToAdd.add(partitionKey);
            relativePaths.add(relativeDir);
        }
    }

    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
    // the original outputDir.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileSystem fs = finalOutput.getFileSystem(configuration);
    for (FileStatus stat : getAllCommittedTaskPaths(context)) {
        mergePaths(fs, stat, finalOutput);
    }

    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(),
            PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);

    // create all the necessary partitions
    for (PartitionKey partitionKey : partitionsToAdd) {
        PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
        partitionOutput.setMetadata(metadata);
        partitionOutput.addPartition();
    }

    // close the TaskContext, which flushes dataset operations
    try {
        taskContext.flushOperations();
    } catch (Exception e) {
        Throwables.propagateIfPossible(e, IOException.class);
        throw new IOException(e);
    }

    // delete the job-specific _temporary folder and create a _done file in the o/p folder
    cleanupJob(context);

    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : relativePaths) {
            Path pathToMark = new Path(finalOutput, relativePath);
            Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
            fs.createNewFile(markerPath);
        }
    }
}

From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java

License:Apache License

/**
 * Merge two paths together.  Anything in from will be moved into to, if there
 * are any name conflicts while merging the files or directories in from win.
 * @param fs the File System to use//  ww w  .j a v  a  2  s . c  om
 * @param from the path data is coming from.
 * @param to the path data is going to.
 * @throws IOException on any error
 */
private void mergePaths(FileSystem fs, final FileStatus from, final Path to) throws IOException {
    if (from.isFile()) {
        if (fs.exists(to)) {
            if (!fs.delete(to, true)) {
                throw new IOException("Failed to delete " + to);
            }
        }

        if (!fs.rename(from.getPath(), to)) {
            throw new IOException("Failed to rename " + from + " to " + to);
        }
    } else if (from.isDirectory()) {
        if (fs.exists(to)) {
            FileStatus toStat = fs.getFileStatus(to);
            if (!toStat.isDirectory()) {
                if (!fs.delete(to, true)) {
                    throw new IOException("Failed to delete " + to);
                }
                if (!fs.rename(from.getPath(), to)) {
                    throw new IOException("Failed to rename " + from + " to " + to);
                }
            } else {
                //It is a directory so merge everything in the directories
                for (FileStatus subFrom : fs.listStatus(from.getPath())) {
                    Path subTo = new Path(to, subFrom.getPath().getName());
                    mergePaths(fs, subFrom, subTo);
                }
            }
        } else {
            //it does not exist just rename
            if (!fs.rename(from.getPath(), to)) {
                throw new IOException("Failed to rename " + from + " to " + to);
            }
        }
    }
}

From source file:co.cask.hydrator.plugin.db.batch.action.VerticaBulkImportAction.java

License:Apache License

@Override
public void run(ActionContext context) throws Exception {
    Object driver = Class.forName("com.vertica.jdbc.Driver").newInstance();
    DriverManager.registerDriver((Driver) driver);

    Preconditions.checkArgument(tableExists(config.tableName),
            "Table %s does not exist. Please check that the 'tableName' property "
                    + "has been set correctly, and that the connection string %s points to a valid database.",
            config.tableName, config.connectionString);

    String copyStatement;//from w  ww  .j a va  2s  .  c o  m

    if (config.level.equalsIgnoreCase("basic")) {
        // COPY tableName FROM STDIN DELIMITER 'delimiter'
        copyStatement = String.format("COPY %s FROM STDIN DELIMITER '%s'", config.tableName, config.delimiter);
    } else {
        copyStatement = config.copyStatement;
    }

    LOG.debug("Copy statement is: {}", copyStatement);

    try {
        try (Connection connection = DriverManager.getConnection(config.connectionString, config.user,
                config.password)) {
            connection.setAutoCommit(false);
            // run Copy statement
            VerticaCopyStream stream = new VerticaCopyStream((VerticaConnection) connection, copyStatement);
            // Keep running count of the number of rejects
            int totalRejects = 0;

            // start() starts the stream process, and opens the COPY command.
            stream.start();

            FileSystem fs = FileSystem.get(new Configuration());

            List<String> fileList = new ArrayList<>();
            FileStatus[] fileStatus;
            try {
                fileStatus = fs.listStatus(new Path(config.path));
                for (FileStatus fileStat : fileStatus) {
                    fileList.add(fileStat.getPath().toString());
                }
            } catch (FileNotFoundException e) {
                throw new IllegalArgumentException(String.format(String.format(
                        "Path %s not found on file system. Please provide correct path.", config.path), e));
            }

            if (fileStatus.length <= 0) {
                LOG.warn("No files available to load into vertica database");
            }

            for (String file : fileList) {
                Path path = new Path(file);

                FSDataInputStream inputStream = fs.open(path);
                // Add stream to the VerticaCopyStream
                stream.addStream(inputStream);

                // call execute() to load the newly added stream. You could
                // add many streams and call execute once to load them all.
                // Which method you choose depends mainly on whether you want
                // the ability to check the number of rejections as the load
                // progresses so you can stop if the number of rejects gets too
                // high. Also, high numbers of InputStreams could create a
                // resource issue on your client system.
                stream.execute();

                // Show any rejects from this execution of the stream load
                // getRejects() returns a List containing the
                // row numbers of rejected rows.
                List<Long> rejects = stream.getRejects();

                // The size of the list gives you the number of rejected rows.
                int numRejects = rejects.size();
                totalRejects += numRejects;
                if (config.autoCommit.equalsIgnoreCase("true")) {
                    // Commit the loaded data
                    connection.commit();
                }
            }

            // Finish closes the COPY command. It returns the number of
            // rows inserted.
            long results = stream.finish();

            context.getMetrics().gauge("num.of.rows.rejected", totalRejects);
            context.getMetrics().gauge("num.of.rows.inserted", results);

            // Commit the loaded data
            connection.commit();
        }
    } catch (Exception e) {
        throw new RuntimeException(String.format("Exception while running copy statement %s", copyStatement),
                e);
    } finally {
        DriverManager.deregisterDriver((Driver) driver);
    }
}

From source file:co.cask.tephra.persist.HDFSTransactionStateStorage.java

License:Apache License

@Override
public List<String> listSnapshots() throws IOException {
    FileStatus[] files = fs.listStatus(snapshotDir, SNAPSHOT_FILE_FILTER);
    return Lists.transform(Arrays.asList(files), new Function<FileStatus, String>() {
        @Nullable//ww w.  j  ava2s  . co  m
        @Override
        public String apply(@Nullable FileStatus input) {
            return input.getPath().getName();
        }
    });
}

From source file:co.cask.tephra.persist.HDFSTransactionStateStorage.java

License:Apache License

@Override
public void deleteLogsOlderThan(long timestamp) throws IOException {
    FileStatus[] statuses = fs.listStatus(snapshotDir, new LogFileFilter(0, timestamp));
    int removedCnt = 0;
    for (FileStatus status : statuses) {
        LOG.debug("Removing old transaction log {}", status.getPath());
        if (fs.delete(status.getPath(), false)) {
            removedCnt++;//  w  w  w . j a  v a  2 s. c o  m
        } else {
            LOG.error("Failed to delete transaction log file {}", status.getPath());
        }
    }
    LOG.debug("Removed {} transaction logs older than {}", removedCnt, timestamp);
}

From source file:co.cask.tephra.persist.HDFSTransactionStateStorage.java

License:Apache License

@Override
public List<String> listLogs() throws IOException {
    FileStatus[] files = fs.listStatus(snapshotDir, new LogFileFilter(0, Long.MAX_VALUE));
    return Lists.transform(Arrays.asList(files), new Function<FileStatus, String>() {
        @Nullable//from   w  w  w  .j  a v  a  2  s .  co m
        @Override
        public String apply(@Nullable FileStatus input) {
            return input.getPath().getName();
        }
    });
}

From source file:co.nubetech.hiho.dedup.TestDedupJob.java

License:Apache License

@Test
public void testDedupByValueWithDelimitedTextInputFormat() throws Exception {
    final String inputData1 = "Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson";
    final String inputData2 = "Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");
    String[] args = new String[] { "-inputFormat", "co.nubetech.hiho.dedup.DelimitedTextInputFormat",
            "-inputKeyClassName", "org.apache.hadoop.io.Text", "-inputValueClassName",
            "org.apache.hadoop.io.Text", "-inputPath", "/input1,/input2", "-outputPath", "output", "-delimeter",
            ",", "-column", "1", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);/*from ww w . j a v a  2s . c om*/
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(5, count);
}

From source file:co.nubetech.hiho.dedup.TestDedupJob.java

License:Apache License

@Test
public void testDedupByValueWithTextInputFormat() throws Exception {
    final String inputData1 = "Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson";
    final String inputData2 = "Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");
    String[] args = new String[] { "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
            "-inputPath", "/input1,/input2", "-outputPath", "output", "-outputFormat",
            "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);/*from   ww w .  j  av  a2 s  . co m*/
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(5, count);
}

From source file:co.nubetech.hiho.dedup.TestDedupJob.java

License:Apache License

@Test
public void testDedupByValueWithSequenceFileAsTextInputFormat() throws Exception {
    HashMap<Text, Text> inputData1 = new HashMap<Text, Text>();
    inputData1.put(new Text("1"),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new Text("2"),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new Text("3"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<Text, Text> inputData2 = new HashMap<Text, Text>();
    inputData2.put(new Text("1"),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new Text("2"),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new Text("4"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat", "-outputFormat",
            "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat", "-inputPath", "/input1,/input2",
            "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.Text", "-inputValueClassName",
            "org.apache.hadoop.io.Text", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);//from   w w w. j a v a  2 s. co m
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(5, count);
}