List of usage examples for org.apache.hadoop.fs Path makeQualified
@Deprecated
public Path makeQualified(FileSystem fs)
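Before the per-project examples, here is a minimal, self-contained sketch of the call itself. It assumes a default Configuration and a hypothetical relative path; makeQualified resolves the path against the FileSystem's scheme, authority, and working directory. The fs.makeQualified(path) call shown alongside it is the non-deprecated equivalent.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path relative = new Path("data/input"); // hypothetical path, for illustration only
        // Deprecated form used throughout the examples below:
        Path qualified = relative.makeQualified(fs);
        // Non-deprecated equivalent on FileSystem:
        Path alsoQualified = fs.makeQualified(relative);
        System.out.println(qualified + " == " + alsoQualified);
    }
}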
From source file:org.commoncrawl.mapred.ec2.parser.OutputCommitter.java
License:Open Source License
Path getTempTaskOutputPath(TaskAttemptContext taskContext) {
    JobConf conf = taskContext.getJobConf();
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
        Path p = new Path(outputPath, (FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR + "_"
                + taskContext.getTaskAttemptID().toString()));
        try {
            FileSystem fs = p.getFileSystem(conf);
            return p.makeQualified(fs);
        } catch (IOException ie) {
            LOG.warn(StringUtils.stringifyException(ie));
            return p;
        }
    }
    return null;
}
From source file:org.elasticsearch.repositories.hdfs.HdfsSnapshotRestoreTest.java
License:Apache License
/**
 * Deletes the content of the repository files in the bucket.
 */
public void cleanRepositoryFiles(String basePath) throws IOException {
    LocalFileSystem fs = FileSystem.getLocal(new Configuration());
    Path p = new Path(basePath);
    fs.delete(p.makeQualified(fs), true);
}
From source file:org.kiji.mapreduce.kvstore.FileKeyValueArrayStore.java
License:Apache License
/** {@inheritDoc} */
@Override
public void storeToConf(KeyValueStoreConfiguration conf) throws IOException {
    if (mInputPaths.isEmpty()) {
        throw new IOException("Required attribute not set: input path");
    }
    conf.setLong(CONF_MAX_VALUES, mMaxValues);
    if (mUseDCache && !"local".equals(conf.getDelegate().get("mapred.job.tracker", ""))) {
        // If we're scheduled to use the distributed cache, and we're not in the LocalJobRunner,
        // add these files to the DistributedCache.
        // TODO(WIBI-1653): This does not handle any sort of MapperTester, etc.
        // We need a separate flag that tells this to ignore mUseDCache if we're in a test
        // environment, and just use the original input file specs.
        final String dCachePrefix = getCachePrefix();

        // Associate this randomly chosen prefix id with this KVStore implementation.
        conf.set(CONF_DCACHE_PREFIX, dCachePrefix);

        // Add the input paths to the DistributedCache and translate path names.
        int uniqueId = 0;
        for (Path inputPath : getExpandedInputPaths()) {
            FileSystem fs = inputPath.getFileSystem(conf.getDelegate());
            Path absolutePath = inputPath.makeQualified(fs);
            String uriStr = absolutePath.toString() + "#" + dCachePrefix + "-" + uniqueId;
            LOG.debug("Adding to DistributedCache: " + uriStr);
            uniqueId++;
            try {
                DistributedCache.addCacheFile(new URI(uriStr), conf.getDelegate());
            } catch (URISyntaxException use) {
                throw new IOException("Could not construct URI for file: " + uriStr, use);
            }
        }

        // Ensure that symlinks are created for cached files.
        DistributedCache.createSymlink(conf.getDelegate());

        // Now save the cache prefix into the local state. We couldn't set this earlier,
        // because we wanted getExpandedInputPaths() to actually unglob things. That
        // function will behave differently if mDCachePrefix is already initialized.
        mDCachePrefix = dCachePrefix;
    } else {
        // Just put the regular HDFS paths in the Configuration.
        conf.setStrings(CONF_PATHS,
                Lists.toArray(Lists.map(mInputPaths, new Lists.ToStringFn<Path>()), String.class));
    }
}
From source file:org.kiji.mapreduce.kvstore.lib.FileStoreHelper.java
License:Apache License
/**
 * Serializes file- and DistributedCache-specific properties associated
 * with the KeyValueStore that owns this FileStoreHelper to the specified configuration.
 *
 * @param conf the configuration to populate.
 * @throws IOException if there's an error serializing the state.
 */
public void storeToConf(KeyValueStoreConfiguration conf) throws IOException {
    if (mInputPaths.isEmpty()) {
        throw new IOException("Required attribute not set: input path");
    }
    if (mUseDCache && !"local".equals(conf.get("mapred.job.tracker", ""))) {
        // If we're scheduled to use the distributed cache, and we're not in the LocalJobRunner,
        // add these files to the DistributedCache.
        // TODO(aaron): This does not handle any sort of MapperTester, etc.
        // We need a separate flag that tells this to ignore mUseDCache if we're in a test
        // environment, and just use the original input file specs.
        final String dCachePrefix = getCachePrefix();

        // Associate this randomly chosen prefix id with this KVStore implementation.
        conf.set(CONF_DCACHE_PREFIX_KEY, dCachePrefix);

        // Add the input paths to the DistributedCache and translate path names.
        int uniqueId = 0;
        for (Path inputPath : getExpandedInputPaths()) {
            FileSystem fs = inputPath.getFileSystem(conf.getDelegate());
            Path absolutePath = inputPath.makeQualified(fs);
            String uriStr = absolutePath.toString() + "#" + dCachePrefix + "-" + uniqueId;
            LOG.debug("Adding to DistributedCache: " + uriStr);
            uniqueId++;
            try {
                DistributedCache.addCacheFile(new URI(uriStr), conf.getDelegate());
            } catch (URISyntaxException use) {
                throw new IOException("Could not construct URI for file: " + uriStr, use);
            }
        }

        // Ensure that symlinks are created for cached files.
        DistributedCache.createSymlink(conf.getDelegate());

        // Now save the cache prefix into the local state. We couldn't set this earlier,
        // because we wanted getExpandedInputPaths() to actually unglob things. That
        // function will behave differently if mDCachePrefix is already initialized.
        mDCachePrefix = dCachePrefix;
    } else {
        // Just put the regular HDFS paths in the Configuration.
        conf.setStrings(CONF_PATHS_KEY,
                Lists.toArray(Lists.map(mInputPaths, new Lists.ToStringFn<Path>()), String.class));
    }
}
From source file:org.kiji.mapreduce.output.HFileMapReduceJobOutput.java
License:Apache License
/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys. The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
private static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    job.setPartitionerClass(TotalOrderPartitioner.class);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}
From source file:org.mitre.ccv.mapred.CompleteCompositionVectors.java
License:Open Source License
/**
 * The JSON data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features
 * will be in a different order. This version, by default, sorts only by entropy values, whereas the
 * ccv in-memory version sorts by the k-mer natural order (i.e., lexicographic).
 *
 * @param argv
 * @return
 * @throws java.lang.Exception
 */
@Override
@SuppressWarnings("static-access") // For OptionBuilder
public int run(String[] argv) throws Exception {
    JobConf conf = new JobConf(getConf());
    String cli_title = "CompleteCompositionVectorHadoop";
    int start = CalculateKmerCounts.DEFAULT_START;
    int end = CalculateKmerCounts.DEFAULT_END;
    int topkmers = 0;

    String input = null;
    String output = null;
    String vectorJsonOutput = null;
    //String kmerJsonOutput = null;
    boolean cleanLogs = false;

    /** create the Options */
    Options options = new Options();

    /** Hadoop Options */
    options.addOption(
            OptionBuilder.withArgName("number").hasArg(true).withDescription("number of maps").create("m"));
    options.addOption(
            OptionBuilder.withArgName("number").hasArg(true).withDescription("number of reducers").create("r"));
    // org.hadoop.util.GenericOptionsParser should capture this, but it doesn't
    options.addOption(OptionBuilder.withArgName("property=value").hasArg(true).withValueSeparator()
            .withDescription("use value for given property").create("D"));

    /** CompleteCompositionVector Options */
    options.addOption(OptionBuilder.withArgName("number").hasArg(true)
            .withDescription("number of top k-mers to use in calculations").create("topKmers"));
    options.addOption(OptionBuilder.withArgName("start").hasArg(true).withDescription("starting length of tile")
            .create("start"));
    options.addOption(OptionBuilder.withArgName("end").hasArg(true).withDescription("ending length of tile")
            .create("end"));
    options.addOption(OptionBuilder.hasArg(true).withArgName("file")
            .withDescription("JSON file to write out k-mers to").create("kmersfile"));
    options.addOption(OptionBuilder.hasArg(true).withArgName("file")
            .withDescription("JSON file to write out feature vectors to "
                    + "(Overrides kmersout, only one file will be written).")
            .create("vectorsfile"));
    options.addOption(OptionBuilder.withArgName("number").hasArg(true)
            .withDescription("What preference to use: 0-min 1-median 2-avg(min,med): default is median")
            .create("prefval"));
    options.addOption(OptionBuilder.withArgName("help").hasArg(false).withDescription("print this message")
            .create("help"));

    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();

    //GenericOptionsParser gop = new GenericOptionsParser(conf, options, argv);
    GenericOptionsParser gop = new GenericOptionsParser(conf, argv);
    String[] remaining_args = gop.getRemainingArgs();

    // create the parser
    CommandLineParser parser = new GnuParser();
    //CommandLine line = gop.getCommandLine();
    String[] other_args = new String[] {};
    try {
        CommandLine line = parser.parse(options, remaining_args);
        other_args = line.getArgs();

        // Make sure there is a parameter left.
        if (other_args.length == 0) {
            System.out.println(cli_title);
            System.out.println("Missing input path!");
            formatter.printHelp("hccv [options] <input> [<output>] ", options);
            GenericOptionsParser.printGenericCommandUsage(System.out);
            return -1;
        }
        Option[] opts = line.getOptions();
        if (line.hasOption("help")) {
            System.out.println(cli_title);
            formatter.printHelp("hccv [options] <input> [<output>] ", options);
            GenericOptionsParser.printGenericCommandUsage(System.out);
            return -1;
        }
        // could also use line.iterator()
        for (Option opt : opts) {
            if (opt.getOpt().equals("m")) {
                conf.setNumMapTasks(Integer.parseInt(opt.getValue()));
            }
            if (opt.getOpt().equals("r")) {
                conf.setNumReduceTasks(Integer.parseInt(opt.getValue()));
            }
            if (opt.getOpt().equals("D")) {
                // We can have multiple properties we want to set
                String[] properties = opt.getValues();
                for (String property : properties) {
                    String[] keyval = property.split("=");
                    conf.set(keyval[0], keyval[1]);
                }
            }
            if (opt.getOpt().equals("start")) {
                start = Integer.parseInt(opt.getValue());
            }
            if (opt.getOpt().equals("end")) {
                end = Integer.parseInt(opt.getValue());
            }
            if (opt.getOpt().equals("topKmers")) {
                topkmers = Integer.parseInt(opt.getValue());
            }
            if (opt.getOpt().equals("vectorsfile")) {
                vectorJsonOutput = opt.getValue();
            }
        }
    } catch (ParseException e) {
        LOG.warn("options parsing failed: " + e.getMessage());
        System.out.println(cli_title);
        formatter.printHelp("hccv [options] <input> [<output>] ", options);
        GenericOptionsParser.printGenericCommandUsage(System.out);
    }

    if (start <= 2) {
        throw new IllegalArgumentException("Value of 'start' argument must be larger than 2");
    }

    input = other_args[0];
    if (other_args.length < 2) {
        output = input + "_" + FileUtils.getSimpleDate();
    } else {
        output = other_args[2];
    }

    /**
     * Check output path. Either needs to exist as a directory or not exist
     */
    Path outputPath = new Path(output);
    FileSystem fs = outputPath.getFileSystem(conf);
    if (!fs.exists(outputPath)) {
        fs.mkdirs(outputPath);
    } else if (fs.exists(outputPath) || !fs.getFileStatus(outputPath).isDir()) {
        LOG.fatal(String.format("Output directory %s already exists", outputPath.makeQualified(fs)));
        throw new FileAlreadyExistsException(
                String.format("Output directory %s already exists", outputPath.makeQualified(fs)));
    }
    String outputDir = output + Path.SEPARATOR;

    int res;

    /**
     * Zero, CalculateCompositionVectors
     */
    LOG.info("Starting CalculateCompositionVectors Map-Reduce job");
    CalculateCompositionVectors cv = new CalculateCompositionVectors();
    res = cv.initJob(conf, start, end, input, outputDir + COMPOSITION_VECTORS, cleanLogs);
    if (res != 0) {
        LOG.info("CalculateCompositionVectors returned non-zero result!");
        return res;
    }
    // We can stop now or continue to reduce dimensionality using RRE or other means

    /**
     * First, CalculateKmerCounts
     */
    LOG.info("Starting CalculateKmerCounts Map-Reduce job");
    // FastMap option for CalculateKmers!?!
    CalculateKmerCounts ckc = new CalculateKmerCounts();
    res = ckc.initJob(conf, start, end, input, outputDir + KMER_COUNTS);
    if (res != 0) {
        LOG.fatal("CalculateKmerCounts returned non-zero result!");
        return res;
    }

    /**
     * Second, TotalSequenceLength
     */
    LOG.info("Starting TotalSequenceLength Map-Reduce job");
    TotalSequenceLength tsl = new TotalSequenceLength();
    res = tsl.initJob(conf, input, outputDir + TOTAL_LENGTH, cleanLogs);
    if (res != 0) {
        LOG.fatal("TotalSequenceLength returned non-zero result!");
        return res;
    }
    int length = tsl.getCount(conf, outputDir + TOTAL_LENGTH);
    if (length < 3) {
        LOG.fatal("TotalSequenceLength returned a total sequence length of less than 3.");
        return -1;
    } else {
        LOG.info(String.format("TotalSequenceLength returned a total sequence length of %d.", length));
    }

    /**
     * Third, CalculateKmerProbabilities
     */
    LOG.info("Starting CalculateKmerProbabilities Map-Reduce job");
    CalculateKmerProbabilities ckp = new CalculateKmerProbabilities();
    res = ckp.initJob(conf, start, end, length, outputDir + KMER_COUNTS, outputDir + KMER_PROBABILITIES,
            cleanLogs);
    if (res != 0) {
        LOG.fatal("CalculateKmerProbabilities returned non-zero result!");
        return res;
    }

    /**
     * Fourth, InvertKmerProbabilities
     */
    LOG.info("Starting InvertKmerProbabilities Map-Reduce job");
    InvertKmerProbabilities ikp = new InvertKmerProbabilities();
    res = ikp.initJob(conf, outputDir + KMER_PROBABILITIES, outputDir + INVERTED_KMER_PROBABILITIES, cleanLogs);
    if (res != 0) {
        LOG.fatal("InvertKmerProbabilities returned non-zero result!");
        return res;
    }

    /**
     * Fifth, CalculateKmerPiValues
     */
    LOG.info("Starting CalculateKmerPiValues Map-Reduce job");
    CalculateKmerPiValues kpv = new CalculateKmerPiValues();
    res = kpv.initJob(conf, start, end, outputDir + INVERTED_KMER_PROBABILITIES, outputDir + KMER_PI_VALUES,
            cleanLogs);
    if (res != 0) {
        LOG.fatal("CalculateKmerPiValues returned non-zero result!");
        return res;
    }

    /**
     * Sixth, CalculateKmerRevisedRelativeEntropy
     */
    LOG.info("Starting CalculateKmerRevisedRelativeEntropy Map-Reduce job");
    CalculateKmerRevisedRelativeEntropy krre = new CalculateKmerRevisedRelativeEntropy();
    res = krre.initJob(conf, outputDir + KMER_PI_VALUES, outputDir + COMPOSITION_VECTORS,
            outputDir + ENTROPY_VALUES, cleanLogs);
    if (res != 0) {
        LOG.fatal("CalculateKmerRevisedRelativeEntropy returned non-zero result!");
        return res;
    }

    /**
     * Seventh, SortKmerRevisedRelativeEntropies
     */
    SortKmerRevisedRelativeEntropies srre = new SortKmerRevisedRelativeEntropies();
    res = srre.initJob(conf, outputDir + ENTROPY_VALUES, outputDir + SORTED_ENTROPY_VALUES, cleanLogs);
    if (res != 0) {
        LOG.fatal("SortKmerRevisedRelativeEntropies returned non-zero result!");
        return res;
    }

    /**
     * Eighth, GenerateFeatureVectors
     *
     * Generate a flattened list to add to the cache to be distributed to the map-tasks.
     */
    Path listOutputPath = new Path(outputDir + Integer.toString(topkmers) + KMER_ENTROPY_SET);
    LOG.info(String.format("Loading %d sorted k-mers from %s to %s", topkmers,
            outputDir + SORTED_ENTROPY_VALUES, listOutputPath.toString()));
    int num = CompleteCompositionVectorUtils.flattenKmerEntropySequenceFile(conf, topkmers,
            outputDir + SORTED_ENTROPY_VALUES, listOutputPath.toString(), cleanLogs);
    if (num != topkmers) {
        LOG.fatal(String.format("Requested %d k-mers, but got %d. Using %d", topkmers, num, num));
        topkmers = num;
    }
    GenerateFeatureVectors fv = new GenerateFeatureVectors();
    res = fv.initJob(conf, listOutputPath.toString(), topkmers, outputDir + COMPOSITION_VECTORS,
            outputDir + FEATURE_VECTORS, cleanLogs);
    if (res != 0) {
        LOG.fatal("GenerateFeatureVectors returned non-zero result!");
        return res;
    }

    /**
     * Save feature vectors, features (k-mers), and properties to a JSON file.
     *
     * The data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features
     * will be in a different order. This version, by default, sorts only by entropy values, whereas the
     * ccv in-memory version sorts by the k-mer natural order (i.e., lexicographic).
     */
    if (vectorJsonOutput != null && vectorJsonOutput.length() > 0) {
        LOG.info("Writing features out to " + vectorJsonOutput);
        CompleteCompositionVectorUtils.featureVectors2Json(conf, start, end, topkmers,
                outputDir + SORTED_ENTROPY_VALUES, outputDir + FEATURE_VECTORS, vectorJsonOutput);
    }

    LOG.info("All done generating complete composition vectors and feature vectors.");
    return res;
}
From source file:org.mitre.mapred.fs.FileUtils.java
License:Open Source License
/**
 * Takes input as a comma separated list of files
 * and verifies that they exist. It defaults to file:///
 * if the files specified do not have a scheme.
 * It returns the paths converted to URIs, defaulting to file:///.
 * So an input of /home/user/file1,/home/user/file2 would return
 * file:///home/user/file1,file:///home/user/file2
 *
 * @see org.apache.hadoop.util.GenericOptionsParser#validateFiles(java.lang.String, org.apache.hadoop.conf.Configuration)
 * @param files
 * @return the paths converted to URIs
 */
public static String validateFiles(String files, Configuration conf) throws IOException {
    if (files == null) {
        return null;
    }
    String[] fileArr = files.split(",");
    String[] finalArr = new String[fileArr.length];
    for (int i = 0; i < fileArr.length; i++) {
        String tmp = fileArr[i];
        String finalPath;
        Path path = new Path(tmp);
        URI pathURI = path.toUri();
        FileSystem localFs = FileSystem.getLocal(conf);
        if (pathURI.getScheme() == null) {
            // default to the local file system
            // check if the file exists or not first
            if (!localFs.exists(path)) {
                throw new FileNotFoundException("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified(localFs).toString();
        } else {
            // check if the file exists in this file system
            // we need to recreate this filesystem object to copy
            // these files to the file system jobtracker is running
            // on.
            FileSystem fs = path.getFileSystem(conf);
            if (!fs.exists(path)) {
                throw new FileNotFoundException("File " + tmp + " does not exist.");
            }
            finalPath = path.makeQualified(fs).toString();
            try {
                fs.close();
            } catch (IOException e) {
            }
        }
        finalArr[i] = finalPath;
    }
    return StringUtils.arrayToString(finalArr);
}
From source file:org.mrgeo.cmd.ingestvector.IngestVector.java
License:Apache License
List<String> getInputs(String arg, boolean recurse) throws IOException {
    GeotoolsVectorReader reader = null;
    List<String> inputs = new LinkedList<String>();

    File f = new File(arg);
    URI uri = f.toURI();

    // recurse through directories
    if (f.isDirectory()) {
        File[] dir = f.listFiles();
        for (File s : dir) {
            try {
                if (s.isFile() || (s.isDirectory() && recurse)) {
                    inputs.addAll(getInputs(s.getCanonicalPath(), recurse));
                }
            } catch (IOException e) {
            }
        }
    } else if (f.isFile()) {
        // is this a valid file?
        System.out.print("*** checking " + f.getCanonicalPath());
        try {
            reader = GeotoolsVectorUtils.open(uri);
            if (reader != null) {
                System.out.println(" accepted ***");
                inputs.add(uri.toString());
            } else {
                System.out.println(" can't load ***");
            }
        } catch (IOException e) {
            System.out.println(" can't load ***");
        }
    } else {
        Path p = new Path(arg);
        FileSystem fs = HadoopFileUtils.getFileSystem(config, p);
        if (fs.exists(p)) {
            FileStatus status = fs.getFileStatus(p);
            if (status.isDir() && recurse) {
                FileStatus[] files = fs.listStatus(p);
                for (FileStatus file : files) {
                    inputs.addAll(getInputs(file.getPath().toString(), recurse));
                }
            } else {
                // is this a valid file?
                System.out.print("*** checking " + p.toString());
                try {
                    reader = GeotoolsVectorUtils.open(p.makeQualified(fs).toUri());
                    if (reader != null) {
                        System.out.println(" accepted ***");
                        inputs.add(p.toString());
                    } else {
                        System.out.println(" can't load ***");
                    }
                } catch (IOException e) {
                    System.out.println(" can't load ***");
                }
            }
        }
    }
    return inputs;
}
From source file:org.mrgeo.format.DirectoryInputFormat.java
License:Apache License
public static void setParentDirectory(Job job, Path parent) throws IOException {
    Configuration conf = job.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    Path path = parent.makeQualified(fs);
    conf.set("mapred.input.dir", StringUtils.escapeString(path.toString()));
}
From source file:org.mrgeo.hdfs.utils.HadoopFileUtils.java
License:Apache License
/**
 * Deletes the specified path. If the scheme is s3 or s3n, then it will wait
 * until the path is gone before returning, or else throw an IOException indicating
 * that the path still exists. This is because s3 operates under eventual
 * consistency, so deletes are not guaranteed to happen right away.
 *
 * @param conf
 * @param path
 * @throws IOException
 */
public static void delete(final Configuration conf, final Path path) throws IOException {
    final FileSystem fs = getFileSystem(conf, path);
    if (fs.exists(path)) {
        log.info("Deleting path " + path.toString());
        if (fs.delete(path, true) == false) {
            throw new IOException("Error deleting directory " + path.toString());
        }
        Path qualifiedPath = path.makeQualified(fs);
        URI pathUri = qualifiedPath.toUri();
        String scheme = pathUri.getScheme().toLowerCase();
        if ("s3".equals(scheme) || "s3n".equals(scheme)) {
            boolean stillExists = fs.exists(path);
            int sleepIndex = 0;
            // Wait for S3 to finish the deletion in phases - initially checking
            // more frequently and then less frequently as time goes by.
            int[][] waitPhases = { { 60, 1 }, { 120, 2 }, { 60, 15 } };
            while (sleepIndex < waitPhases.length) {
                int waitCount = 0;
                log.info("Sleep index " + sleepIndex);
                while (stillExists && waitCount < waitPhases[sleepIndex][0]) {
                    waitCount++;
                    log.info("Waiting " + waitPhases[sleepIndex][1] + " seconds " + path.toString()
                            + " to be deleted");
                    try {
                        Thread.sleep(waitPhases[sleepIndex][1] * 1000L);
                    } catch (InterruptedException e) {
                        log.warn("While waiting for " + path.toString() + " to be deleted", e);
                    }
                    stillExists = fs.exists(path);
                    log.info("After waiting exists = " + stillExists);
                }
                sleepIndex++;
            }
            if (stillExists) {
                throw new IOException(path.toString() + " was not deleted within the waiting period");
            }
        }
    } else {
        log.info("Path already does not exist " + path.toString());
    }
}
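A brief usage sketch for the helper above. The bucket and path are hypothetical, and the call simply exercises the delete(Configuration, Path) signature shown in the listing; on an s3/s3n path the method polls until the eventually consistent store reports the path gone, or throws once the wait phases are exhausted.

Configuration conf = new Configuration();
// Hypothetical S3 path for illustration only.
HadoopFileUtils.delete(conf, new Path("s3n://example-bucket/tmp/old-output"));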