Example usage for org.apache.hadoop.fs.Path.toUri()

List of usage examples for org.apache.hadoop.fs.Path.toUri()

Introduction

On this page you can find example usage for org.apache.hadoop.fs.Path.toUri().

Prototype

public URI toUri() 

Document

Convert this Path to a URI.
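
Before the longer examples below, here is a minimal, self-contained sketch (the host and file names are purely illustrative) of what toUri() exposes: the scheme, authority and path components the Path was built from, and the round trip back to a Path.

import java.net.URI;

import org.apache.hadoop.fs.Path;

public class PathToUriExample {
    public static void main(String[] args) {
        // A fully qualified path: scheme, authority and path are all present.
        Path qualified = new Path("hdfs://namenode:8020/user/alice/data.txt");
        URI uri = qualified.toUri();
        System.out.println(uri.getScheme());    // hdfs
        System.out.println(uri.getAuthority()); // namenode:8020
        System.out.println(uri.getPath());      // /user/alice/data.txt

        // A relative Path yields a URI with no scheme or authority.
        System.out.println(new Path("data/part-00000").toUri()); // data/part-00000

        // Round trip: a Path can be rebuilt from its URI.
        System.out.println(new Path(uri).equals(qualified)); // true
    }
}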

Usage

From source file: com.twitter.hraven.etl.JobFilePreprocessor.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    // When we started processing. This is also the upper limit of files we
    // accept, next run will pick up the new incoming files.
    long processingStartMillis = System.currentTimeMillis();

    Configuration hbaseConf = HBaseConfiguration.create(getConf());

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Output should be an hdfs path.
    FileSystem hdfs = FileSystem.get(hbaseConf);

    // Grab the input path argument
    String output = commandLine.getOptionValue("o");
    LOG.info(" output=" + output);
    Path outputPath = new Path(output);
    FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);

    if (!outputFileStatus.isDir()) {
        throw new IOException("Output is not a directory" + outputFileStatus.getPath().getName());
    }

    // Grab the input path argument
    String input;
    if (commandLine.hasOption("i")) {
        input = commandLine.getOptionValue("i");
    } else {
        input = hbaseConf.get("mapred.job.tracker.history.completed.location");
    }
    LOG.info("input=" + input);

    // Grab the batch-size argument
    int batchSize;
    if (commandLine.hasOption("b")) {
        try {
            batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "batch size option -b is is not a valid number: " + commandLine.getOptionValue("b"), nfe);
        }
        // Additional check
        if (batchSize < 1) {
            throw new IllegalArgumentException(
                    "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
                            + commandLine.getOptionValue("b"));
        }
    } else {
        batchSize = DEFAULT_BATCH_SIZE;
    }

    boolean forceAllFiles = commandLine.hasOption("f");
    LOG.info("forceAllFiles: " + forceAllFiles);

    Path inputPath = new Path(input);
    FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);

    if (!inputFileStatus.isDir()) {
        throw new IOException("Input is not a directory" + inputFileStatus.getPath().getName());
    }

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    /**
     * Grab the max-file-size argument (-s): an HBase cell cannot store files
     * bigger than maxFileSize, so larger files are skipped during raw loading.
     * Reference: https://github.com/twitter/hraven/issues/59
     */
    String maxFileSizeStr = commandLine.getOptionValue("s");
    LOG.info("maxFileSize=" + maxFileSizeStr);
    long maxFileSize = DEFAULT_RAW_FILE_SIZE_LIMIT;
    // Fall back to the default limit when the -s option is not provided.
    if (maxFileSizeStr != null) {
        try {
            maxFileSize = Long.parseLong(maxFileSizeStr);
        } catch (NumberFormatException nfe) {
            throw new ProcessingException(
                    "Caught NumberFormatException during conversion of maxFileSize to long", nfe);
        }
    }

    ProcessRecordService processRecordService = new ProcessRecordService(hbaseConf);

    boolean success = true;
    try {

        // Figure out where we last left off (if anywhere at all)
        ProcessRecord lastProcessRecord = null;

        if (!forceAllFiles) {
            lastProcessRecord = processRecordService.getLastSuccessfulProcessRecord(cluster);
        }

        long minModificationTimeMillis = 0;
        if (lastProcessRecord != null) {
            // Start of this time period is the end of the last period.
            minModificationTimeMillis = lastProcessRecord.getMaxModificationTimeMillis();
        }

        // Do a sanity check. The end time of the last scan better not be later
        // than when we started processing.
        if (minModificationTimeMillis > processingStartMillis) {
            throw new RuntimeException("The last processing record has maxModificationMillis later than now: "
                    + lastProcessRecord);
        }

        // Accept only jobFiles and only those that fall in the desired range of
        // modification time.
        JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter = new JobFileModifiedRangePathFilter(
                hbaseConf, minModificationTimeMillis);

        String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date(minModificationTimeMillis));

        ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
        LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath
                + " that are modified since " + timestamp);

        // get the files in the done folder,
        // need to traverse dirs under done recursively for versions
        // that include MAPREDUCE-323: on/after hadoop 0.20.203.0
        // on/after cdh3u5
        FileStatus[] jobFileStatusses = FileLister.getListFilesToProcess(maxFileSize, true, hdfs, inputPath,
                jobFileModifiedRangePathFilter);

        LOG.info("Sorting " + jobFileStatusses.length + " job files.");

        Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

        // Process these files in batches at a time.
        int batchCount = BatchUtil.getBatchCount(jobFileStatusses.length, batchSize);
        LOG.info("Batch count: " + batchCount);
        for (int b = 0; b < batchCount; b++) {
            processBatch(jobFileStatusses, b, batchSize, processRecordService, cluster, outputPath);
        }

    } finally {
        processRecordService.close();
    }

    Statistics statistics = FileSystem.getStatistics(inputPath.toUri().getScheme(), hdfs.getClass());
    if (statistics != null) {
        LOG.info("HDFS bytes read: " + statistics.getBytesRead());
        LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
        LOG.info("HDFS read ops: " + statistics.getReadOps());
        LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
        LOG.info("HDFS write ops: " + statistics.getWriteOps());
    }

    // Return the status
    return success ? 0 : 1;
}
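
The last block of run() looks up per-filesystem counters with FileSystem.getStatistics(), which is keyed by URI scheme; inputPath.toUri().getScheme() supplies that key. A minimal sketch of the same pattern follows (the configuration and path are illustrative; qualifying the path first guards against a null scheme when the Path was built from a bare string):

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);

// Qualify the path so toUri() carries the filesystem's scheme and authority.
Path inputPath = fs.makeQualified(new Path("/input"));
String scheme = inputPath.toUri().getScheme(); // e.g. "hdfs" or "file"

FileSystem.Statistics stats = FileSystem.getStatistics(scheme, fs.getClass());
if (stats != null) {
    System.out.println(scheme + " bytes read: " + stats.getBytesRead());
}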

From source file: com.twitter.hraven.etl.JobFileProcessor.java

License: Apache License

public int run(String[] args) throws Exception {

    Configuration hbaseConf = HBaseConfiguration.create(getConf());

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    // Number of parallel threads to use
    int threadCount = 1;
    if (commandLine.hasOption("t")) {
        try {
            threadCount = Integer.parseInt(commandLine.getOptionValue("t"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "Provided thread-count argument (-t) is not a number: " + commandLine.getOptionValue("t"),
                    nfe);
        }
        if (threadCount < 1) {
            throw new IllegalArgumentException(
                    "Cannot run fewer than 1 thread. Provided thread-count argument (-t): " + threadCount);
        }
    }
    LOG.info("threadCount=" + threadCount);

    boolean reprocess = commandLine.hasOption("r");
    LOG.info("reprocess=" + reprocess);

    // Grab the batch-size argument
    int batchSize;
    if (commandLine.hasOption("b")) {
        try {
            batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "batch size option -b is is not a valid number: " + commandLine.getOptionValue("b"), nfe);
        }
        // Additional check
        if (batchSize < 1) {
            throw new IllegalArgumentException(
                    "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
                            + commandLine.getOptionValue("b"));
        }
    } else {
        batchSize = DEFAULT_BATCH_SIZE;
    }

    // Grab the costfile argument

    String costFilePath = commandLine.getOptionValue("zf");
    LOG.info("cost properties file on hdfs=" + costFilePath);
    if (costFilePath == null)
        costFilePath = Constants.COST_PROPERTIES_HDFS_DIR;
    Path hdfsPath = new Path(costFilePath + Constants.COST_PROPERTIES_FILENAME);
    // add to distributed cache
    DistributedCache.addCacheFile(hdfsPath.toUri(), hbaseConf);

    // Grab the machine type argument
    String machineType = commandLine.getOptionValue("m");
    // set it as part of conf so that the
    // hRaven job can access it in the mapper
    hbaseConf.set(Constants.HRAVEN_MACHINE_TYPE, machineType);

    // check if re-aggregate option is forced on
    // if yes, we need to aggregate for this job inspite of
    // job having aggregation done status in raw table
    boolean reAggregateFlagValue = false;
    if (commandLine.hasOption("ra")) {
        String reaggregateFlag = commandLine.getOptionValue("ra");
        // set it as part of conf so that the
        // hRaven jobProcessor can access it in the mapper
        if (StringUtils.isNotBlank(reaggregateFlag)) {
            LOG.info(" reaggregateFlag is: " + reaggregateFlag);
            if (StringUtils.equalsIgnoreCase(reaggregateFlag, Boolean.TRUE.toString())) {
                reAggregateFlagValue = true;
            }
        }
    }
    LOG.info(AggregationConstants.RE_AGGREGATION_FLAG_NAME + "=" + reAggregateFlagValue);
    hbaseConf.setBoolean(AggregationConstants.RE_AGGREGATION_FLAG_NAME, reAggregateFlagValue);

    // set aggregation to off by default
    boolean aggFlagValue = false;
    if (commandLine.hasOption("a")) {
        String aggregateFlag = commandLine.getOptionValue("a");
        // set it as part of conf so that the
        // hRaven jobProcessor can access it in the mapper
        if (StringUtils.isNotBlank(aggregateFlag)) {
            LOG.info(" aggregateFlag is: " + aggregateFlag);
            if (StringUtils.equalsIgnoreCase(aggregateFlag, Boolean.TRUE.toString())) {
                aggFlagValue = true;
            }
        }
    }
    if (reprocess) {
        // turn off aggregation if reprocessing is true
        // we don't want to inadvertently aggregate again while re-processing
        // re-aggregation needs to be a conscious setting
        aggFlagValue = false;
    }
    LOG.info(AggregationConstants.AGGREGATION_FLAG_NAME + "=" + aggFlagValue);
    hbaseConf.setBoolean(AggregationConstants.AGGREGATION_FLAG_NAME, aggFlagValue);

    String processFileSubstring = null;
    if (commandLine.hasOption("p")) {
        processFileSubstring = commandLine.getOptionValue("p");
    }
    LOG.info("processFileSubstring=" + processFileSubstring);

    // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have
    // history files exceeding that. Disable limit.
    hbaseConf.setInt("hbase.client.keyvalue.maxsize", 0);

    // Shove this into the jobConf so that we can get it out on the task side.
    hbaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster);

    boolean success = false;
    if (reprocess) {
        success = reProcessRecords(hbaseConf, cluster, batchSize, threadCount);
    } else {
        success = processRecords(hbaseConf, cluster, batchSize, threadCount, processFileSubstring);
    }

    // Return the status
    return success ? 0 : 1;
}
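
The cost-file handling above passes hdfsPath.toUri() to DistributedCache.addCacheFile(), which expects a java.net.URI rather than a Path. A short sketch of the same call (the path is illustrative), alongside the equivalent call through the newer mapreduce API:

Configuration conf = new Configuration();
Path cacheFile = new Path("/user/hraven/costs/cost.properties");

// Old API, as used in JobFileProcessor above:
DistributedCache.addCacheFile(cacheFile.toUri(), conf);

// The newer mapreduce API reaches the same cache through the Job object:
Job job = Job.getInstance(conf);
job.addCacheFile(cacheFile.toUri());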

From source file: com.twitter.hraven.etl.TestFileLister.java

License: Apache License

@Test
public void testPruneFileListBySize() throws IOException {

    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[0] = hdfs.getFileStatus(expPath);

    final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
    File jobConfFile = new File(JOB_CONF_FILE_NAME);
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);

    Path emptyFile = new Path(
            inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
    os = hdfs.createNewFile(emptyFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile));
    origList[0] = hdfs.getFileStatus(emptyFile);

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);

    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[1] = hdfs.getFileStatus(emptyConfFile);

    prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 2);

}
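
The test builds child paths by concatenating inputPath.toUri() with "/" and a file name. An equivalent and arguably more direct form is the two-argument Path constructor, which handles the separator itself; a small sketch (the names are illustrative):

Path inputPath = new Path("/inputdir_filesize");
Path viaUri = new Path(inputPath.toUri() + "/" + "job_0001_conf.xml");
Path viaCtor = new Path(inputPath, "job_0001_conf.xml");
System.out.println(viaUri.equals(viaCtor)); // true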

From source file: com.twitter.hraven.etl.TestFileLister.java

License: Apache License

/**
 * Removes a conf file that has already been put in the pruned list.
 *
 * @throws IOException
 */
@Test
public void testPruneFileListRemovingConfFromPruneList() throws IOException {

    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[2];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize_pruneList");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    Path relocationPath = new Path("/relocation_filesize_pruneList");
    os = hdfs.mkdirs(relocationPath);
    assertTrue(os);
    assertTrue(hdfs.exists(relocationPath));

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329348432655_0001_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[0] = hdfs.getFileStatus(emptyConfFile);

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[1] = hdfs.getFileStatus(expPath);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 0);
}

From source file: com.twitter.hraven.etl.TestFileLister.java

License: Apache License

/**
 * Tests the case where several files are spread out in the directory and need to be removed.
 *
 * @throws IOException
 */
@Test
public void testPruneFileListMultipleFilesAlreadyMovedCases() throws IOException {

    long maxFileSize = 20L;
    FileStatus[] origList = new FileStatus[12];
    FileSystem hdfs = FileSystem.get(UTIL.getConfiguration());
    Path inputPath = new Path("/inputdir_filesize_multiple");
    boolean os = hdfs.mkdirs(inputPath);
    assertTrue(os);
    assertTrue(hdfs.exists(inputPath));

    Path relocationPath = new Path("/relocation_filesize_multiple");
    os = hdfs.mkdirs(relocationPath);
    assertTrue(os);
    assertTrue(hdfs.exists(relocationPath));

    Path emptyFile = new Path(
            inputPath.toUri() + "/" + "job_1329341111111_0101-1329111113227-user2-Sleep.jhist");
    os = hdfs.createNewFile(emptyFile);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile));
    origList[0] = hdfs.getFileStatus(emptyFile);

    Path emptyConfFile = new Path(inputPath.toUri() + "/" + "job_1329341111111_0101_conf.xml");
    os = hdfs.createNewFile(emptyConfFile);

    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile));
    origList[1] = hdfs.getFileStatus(emptyConfFile);

    final String JOB_HISTORY_FILE_NAME = "src/test/resources/job_1329348432655_0001-1329348443227-user-Sleep+job-1329348468601-10-1-SUCCEEDED-default.jhist";
    File jobHistoryfile = new File(JOB_HISTORY_FILE_NAME);
    Path srcPath = new Path(jobHistoryfile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    Path expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[2] = hdfs.getFileStatus(expPath);

    final String JOB_CONF_FILE_NAME = "src/test/resources/job_1329348432655_0001_conf.xml";
    File jobConfFile = new File(JOB_CONF_FILE_NAME);
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputPath);
    expPath = new Path(inputPath.toUri() + "/" + srcPath.getName());
    assertTrue(hdfs.exists(expPath));
    origList[3] = hdfs.getFileStatus(expPath);

    Path inputPath2 = new Path(inputPath.toUri() + "/"
            + "job_1311222222255_0221-1311111143227-user10101-WordCount-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath2);
    assertTrue(hdfs.exists(inputPath2));
    origList[4] = hdfs.getFileStatus(inputPath2);

    Path inputPath3 = new Path(inputPath.toUri() + "/"
            + "job_1399999999155_0991-1311111143227-user3321-TeraGen-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath3);
    assertTrue(hdfs.exists(inputPath3));
    origList[5] = hdfs.getFileStatus(inputPath3);

    Path inputPath4 = new Path(inputPath.toUri() + "/"
            + "job_1399977777177_0771-1311111143227-user3321-TeraSort-1-SUCCEEDED-default.jhist");
    hdfs.copyFromLocalFile(srcPath, inputPath4);
    assertTrue(hdfs.exists(inputPath4));
    origList[6] = hdfs.getFileStatus(inputPath4);

    Path emptyFile2 = new Path(
            inputPath.toUri() + "/" + "job_1329343333333_5551-1329111113227-user2-SomethingElse.jhist");
    os = hdfs.createNewFile(emptyFile2);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyFile2));
    origList[7] = hdfs.getFileStatus(emptyFile2);

    Path emptyConfFile2 = new Path(inputPath.toUri() + "/" + "job_1329343333333_5551_conf.xml");
    os = hdfs.createNewFile(emptyConfFile2);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile2));
    origList[8] = hdfs.getFileStatus(emptyConfFile2);

    // this is an empty file which tests the toBeRemovedFileList
    // at the end of function pruneFileListBySize
    Path emptyConfFile3 = new Path(inputPath.toUri() + "/" + "job_1399999999155_0991_conf.xml");
    os = hdfs.createNewFile(emptyConfFile3);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile3));
    origList[9] = hdfs.getFileStatus(emptyConfFile3);

    Path inputConfPath2 = new Path(inputPath.toUri() + "/" + "job_1311222222255_0221_conf.xml");
    srcPath = new Path(jobConfFile.toURI());
    hdfs.copyFromLocalFile(srcPath, inputConfPath2);
    assertTrue(hdfs.exists(inputConfPath2));
    origList[10] = hdfs.getFileStatus(inputConfPath2);

    // this is an empty file which tests the toBeRemovedFileList
    // at the end of function pruneFileListBySize
    Path emptyConfFile4 = new Path(inputPath.toUri() + "/" + "job_1399977777177_0771_conf.xml");
    os = hdfs.createNewFile(emptyConfFile4);
    assertTrue(os);
    assertTrue(hdfs.exists(emptyConfFile4));
    origList[11] = hdfs.getFileStatus(emptyConfFile4);

    FileStatus[] prunedList = FileLister.pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
    assertNotNull(prunedList);
    assertTrue(prunedList.length == 4);
}

From source file: com.twitter.pig.backend.hadoop.executionengine.tez.TezJobControlCompiler.java

License: Apache License

/**
 * Moves all the results of a collection of MR jobs to the final
 * output directory. Some of the results may have been put into a
 * temp location to work around restrictions with multiple output
 * from a single map reduce job.
 *
 * This method should always be called after the job execution
 * completes.
 */
public void moveResults(List<Job> completedJobs) throws IOException {
    for (Job job : completedJobs) {
        Pair<List<POStore>, Path> pair = jobStoreMap.get(job);
        if (pair != null && pair.second != null) {
            Path tmp = pair.second;
            Path abs = new Path(tmp, "abs");
            Path rel = new Path(tmp, "rel");
            FileSystem fs = tmp.getFileSystem(conf);

            if (fs.exists(abs)) {
                moveResults(abs, abs.toUri().getPath(), fs);
            }

            if (fs.exists(rel)) {
                moveResults(rel, rel.toUri().getPath() + "/", fs);
            }
        }
    }
}
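
Here toUri().getPath() hands the string-based relocation logic a plain slash-separated path with the scheme and authority stripped off; a quick illustration (the URI is made up):

Path abs = new Path("hdfs://namenode:8020/tmp/job-123/abs");
System.out.println(abs.toUri());           // hdfs://namenode:8020/tmp/job-123/abs
System.out.println(abs.toUri().getPath()); // /tmp/job-123/abs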

From source file: com.twitter.pig.backend.hadoop.executionengine.tez.TezJobControlCompiler.java

License: Apache License

private Path removePart(Path src, String part) {
    URI uri = src.toUri();
    String pathStr = uri.getPath().replace(part, "");
    return new Path(pathStr);
}
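
Because removePart() rebuilds the result from the bare path string, any scheme and authority on the source Path are dropped. If they should be preserved, a variant along these lines would keep them (a sketch; the inputs are illustrative):

Path src = new Path("hdfs://namenode:8020/tmp/out/abs/part-00000");
URI uri = src.toUri();
String pathStr = uri.getPath().replace("/abs", "");
Path stripped = new Path(uri.getScheme(), uri.getAuthority(), pathStr);
System.out.println(stripped); // hdfs://namenode:8020/tmp/out/part-00000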

From source file: com.uber.hoodie.common.io.storage.HoodieWrapperFileSystem.java

License: Apache License

private static Path convertPathWithScheme(Path oldPath, String newScheme) {
    URI oldURI = oldPath.toUri();
    URI newURI;
    try {
        newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), oldURI.getPath(),
                oldURI.getQuery(), oldURI.getFragment());
        return new Path(newURI);
    } catch (URISyntaxException e) {
        // TODO - Better Exception handling
        throw new RuntimeException(e);
    }
}
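
A self-contained illustration of the same scheme swap (the names are made up): every component of the original URI except the scheme is carried over. The shorter three-argument Path constructor used here covers the common case; the URI constructor above additionally preserves a query and fragment, which ordinary HDFS paths do not have.

Path original = new Path("hdfs://namenode:8020/tmp/hoodie/table");
URI oldUri = original.toUri();
Path converted = new Path("hoodiefs", oldUri.getAuthority(), oldUri.getPath());
System.out.println(converted); // hoodiefs://namenode:8020/tmp/hoodie/table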

From source file: com.uber.hoodie.common.table.timeline.dto.FilePathDTO.java

License: Apache License

public static FilePathDTO fromPath(Path path) {
    if (null == path) {
        return null;
    }
    FilePathDTO dto = new FilePathDTO();
    dto.uri = path.toUri().toString();
    return dto;
}
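
The DTO stores toUri().toString(), which keeps the full scheme://authority/path form, so the original Path can be rebuilt from it directly; a round-trip sketch (field access is simplified here, the real DTO may expose a getter instead):

Path original = new Path("hdfs://namenode:8020/tmp/hoodie/.hoodie/20190101.commit");
FilePathDTO dto = FilePathDTO.fromPath(original);
Path restored = new Path(dto.uri);
System.out.println(restored.equals(original)); // true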