List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
From source file:cascading.tap.Hfs.java
License:Open Source License
private void makeLocal(JobConf conf, Path qualifiedPath, String infoMessage) { if (!conf.get("mapred.job.tracker", "").equalsIgnoreCase("local") && qualifiedPath.toUri().getScheme().equalsIgnoreCase("file")) { if (LOG.isInfoEnabled()) LOG.info(infoMessage + toString()); conf.set("mapred.job.tracker", "local"); // force job to run locally }//from w ww . j ava2 s . c o m }
From source file:cascading.util.S3Util.java
License:Open Source License
/**
 * Returns the path component of the given path's URI with its first character removed.
 *
 * <p>NOTE(review): the first character is presumably the leading {@code '/'}, so that
 * the result can serve as an S3 object key - confirm against callers.
 *
 * @param path path to derive the key from
 * @return the URI path without its first character
 * @deprecated marked deprecated in the original source; no replacement is visible here
 */
@Deprecated
public static String getKeyFrom(Path path) {
    String uriPath = path.toUri().getPath();
    return uriPath.substring(1);
}
From source file:cc.slda.AnnotateDocuments.java
License:Apache License
/**
 * Runs this tool.
 *
 * <p>Parses the command line, registers the term and title index files in the
 * distributed cache, configures a map-only style job over sequence files and waits
 * for it to finish. Any existing output directory is deleted before the job starts.
 *
 * <p>The probability cutoff is written to the configuration <em>before</em> the
 * {@link Job} is created: {@code Job.getInstance(conf)} works on a copy of the
 * configuration, so a value set on {@code conf} afterwards would never reach the tasks.
 *
 * @param args command-line arguments; {@code INPUT}, {@code OUTPUT} and {@code INDEX}
 *             are required, {@code NUM_REDUCERS} and {@code PCUTOFF} are optional
 * @return {@code 0} when the job succeeds, {@code -1} on a command-line error or
 *         when the job fails
 * @throws Exception if file system access or job submission fails
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));
    options.addOption(OptionBuilder.withArgName(PCUTOFF).hasArg()
            .withDescription("probability of topic assignment").create(PCUTOFF));
    options.addOption(OptionBuilder.withArgName(INDEX).hasArg()
            .withDescription("path to data directory containing term and title indices").create(INDEX));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(INDEX)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String indexPath = cmdline.getOptionValue(INDEX);
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    float cutoff = 0.9f;
    if (cmdline.hasOption(PCUTOFF)) {
        cutoff = Float.parseFloat(cmdline.getOptionValue(PCUTOFF));
    }

    LOG.info("Tool: " + AnnotateDocuments.class.getSimpleName());
    LOG.info(" - indices path: " + indexPath);
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - log(probCutoff): " + Math.log(cutoff));

    Configuration conf = getConf();
    // Must happen before Job.getInstance(conf): the job copies the configuration.
    conf.setFloat(PCUTOFF, cutoff);

    FileSystem fs = FileSystem.get(conf);

    Job job = Job.getInstance(conf);
    job.setJobName(AnnotateDocuments.class.getSimpleName());
    job.setJarByClass(AnnotateDocuments.class);

    String termIndex = indexPath + Path.SEPARATOR + TERM;
    String titleIndex = indexPath + Path.SEPARATOR + TITLE;

    Path termIndexPath = new Path(termIndex);
    Path titleIndexPath = new Path(titleIndex);

    Preconditions.checkArgument(fs.exists(termIndexPath), "Missing term index files... " + termIndexPath);
    DistributedCache.addCacheFile(termIndexPath.toUri(), job.getConfiguration());
    Preconditions.checkArgument(fs.exists(titleIndexPath), "Missing title index files... " + titleIndexPath);
    DistributedCache.addCacheFile(titleIndexPath.toUri(), job.getConfiguration());

    job.setNumReduceTasks(reduceTasks);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(HMapSIW.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(HMapSIW.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    fs.delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    boolean succeeded = job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    if (!succeeded) {
        // Report the failure to the caller instead of claiming success.
        LOG.error("Job " + job.getJobName() + " failed");
        return -1;
    }
    return 0;
}
From source file:cc.solr.lucene.store.hdfs.ChangeFileExt.java
License:Apache License
public static void main(String[] args) throws IOException { Path p = new Path(args[0]); FileSystem fileSystem = FileSystem.get(p.toUri(), new Configuration()); FileStatus[] listStatus = fileSystem.listStatus(p); for (FileStatus fileStatus : listStatus) { Path path = fileStatus.getPath(); fileSystem.rename(path, new Path(path.toString() + ".lf")); }// ww w . j a va 2 s .c om }
From source file:cc.solr.lucene.store.hdfs.ConvertDirectory.java
License:Apache License
/**
 * Recursively converts the file(s) at {@code path} by appending a trailer to each
 * regular file.
 *
 * <p>Directories are traversed depth-first. For a regular file, a single
 * {@link HdfsMetaBlock} spanning the whole current file is appended, followed by the
 * int {@code 1}, the original file length and {@link HdfsFileWriter#VERSION}.
 * NOTE(review): the {@code 1} is presumably the number of meta blocks - confirm
 * against the reader of this format.
 *
 * <p>The append stream is closed in a {@code finally} block so that it is released
 * even when one of the writes fails.
 *
 * @param path file or directory to convert; a message is printed and nothing is done
 *             if it does not exist
 * @throws IOException if the file system cannot be reached or a write fails
 */
public static void convert(Path path) throws IOException {
    FileSystem fileSystem = FileSystem.get(path.toUri(), new Configuration());
    if (!fileSystem.exists(path)) {
        System.out.println(path + " does not exists.");
        return;
    }

    FileStatus fileStatus = fileSystem.getFileStatus(path);
    if (fileStatus.isDir()) {
        FileStatus[] listStatus = fileSystem.listStatus(path);
        for (FileStatus status : listStatus) {
            convert(status.getPath());
        }
        return;
    }

    System.out.println("Converting file [" + path + "]");
    HdfsMetaBlock block = new HdfsMetaBlock();
    block.realPosition = 0;
    block.logicalPosition = 0;
    block.length = fileStatus.getLen();

    FSDataOutputStream outputStream = fileSystem.append(path);
    try {
        block.write(outputStream);
        outputStream.writeInt(1);
        outputStream.writeLong(fileStatus.getLen());
        outputStream.writeInt(HdfsFileWriter.VERSION);
    } finally {
        // Always release the append stream, even if a write above failed.
        outputStream.close();
    }
}
From source file:cmd.download.java
License:Apache License
/**
 * Concatenates the {@code Names.indexId2Node + ".dat"} file found in each matching
 * sub-directory of {@code src} into a single local file of the same name under
 * {@code outPath}.
 *
 * <p>Only children of {@code src} whose name starts with {@code Constants.NAME_SECOND}
 * are used, and they are processed in ascending name order. Both the local output
 * stream and every remote input stream are closed in {@code finally} blocks so that
 * nothing leaks when a copy fails part-way.
 *
 * @param fs            file system holding {@code src}
 * @param src           directory whose matching children are merged
 * @param outPath       local directory that receives the merged file
 * @param configuration configuration passed to {@code IOUtils.copyBytes}
 * @throws FileNotFoundException if the local output file cannot be created
 * @throws IOException           if listing, opening or copying fails
 */
private void mergeToLocalFile(FileSystem fs, Path src, String outPath, Configuration configuration)
        throws FileNotFoundException, IOException {
    FileStatus[] status = fs.listStatus(src);

    // TreeMap keeps the parts ordered by name, which fixes the concatenation order.
    Map<String, Path> paths = new TreeMap<String, Path>();
    for (FileStatus fileStatus : status) {
        Path path = fileStatus.getPath();
        String pathName = path.getName();
        if (pathName.startsWith(Constants.NAME_SECOND)) {
            paths.put(pathName, path);
        }
    }

    File outFile = new File(outPath, Names.indexId2Node + ".dat");
    OutputStream out = new FileOutputStream(outFile);
    try {
        for (Path part : paths.values()) {
            Path path = new Path(src, part);
            log.debug("Concatenating {} into {}...", path.toUri(), outFile.getAbsoluteFile());
            InputStream in = fs.open(new Path(path, Names.indexId2Node + ".dat"));
            try {
                // close=false: both streams are closed explicitly below.
                IOUtils.copyBytes(in, out, configuration, false);
            } finally {
                in.close();
            }
        }
    } finally {
        out.close();
    }
}
From source file:cmd.download.java
License:Apache License
private void mergeToLocalFile2(FileSystem fs, Path src, String outPath, Configuration configuration) throws FileNotFoundException, IOException { // Find all the right paths and copy .gz files locally FileStatus[] status = fs.listStatus(src); Map<String, Path> paths = new TreeMap<String, Path>(); for (FileStatus fileStatus : status) { Path path = fileStatus.getPath(); String pathName = path.getName(); if (pathName.startsWith(Constants.NAME_FOURTH)) { paths.put(pathName, path);//from ww w . j a va 2 s . c o m } } for (String pathName : paths.keySet()) { Path path = new Path(src, paths.get(pathName)); status = fs.listStatus(path); for (FileStatus fileStatus : status) { Path p = fileStatus.getPath(); log.debug("Copying {} to {}...", p.toUri(), outPath); fs.copyToLocalFile(p, new Path(outPath, p.getName())); } } // Merge .gz files into indexName.gz File fileOutputPath = new File(outPath); File[] files = fileOutputPath.listFiles(new FileFilter() { @Override public boolean accept(File pathname) { return pathname.getName().endsWith(".gz"); } }); Arrays.sort(files); String prevIndexName = null; OutputStream out = null; for (File file : files) { log.debug("Processing {}... 
", file.getName()); String indexName = file.getName().substring(0, file.getName().indexOf("_")); if (prevIndexName == null) prevIndexName = indexName; if (out == null) out = new GZIPOutputStream(new FileOutputStream(new File(outPath, indexName + ".gz"))); if (!prevIndexName.equals(indexName)) { if (out != null) out.close(); log.debug("Index name set to {}", indexName); out = new GZIPOutputStream(new FileOutputStream(new File(outPath, indexName + ".gz"))); } InputStream in = new GZIPInputStream(new FileInputStream(file)); log.debug("Copying {} into {}.gz ...", file.getName(), indexName); IOUtils.copyBytes(in, out, 8192, false); in.close(); file.delete(); prevIndexName = indexName; } if (out != null) out.close(); // build B+Tree indexes Location location = new Location(outPath); for (String idxName : Constants.indexNames) { log.debug("Creating {} index...", idxName); String indexFilename = location.absolute(idxName, "gz"); if (new File(indexFilename).exists()) { new File(outPath, idxName + ".dat").delete(); new File(outPath, idxName + ".idn").delete(); CmdIndexBuild.main(location.getDirectoryPath(), idxName, indexFilename); // To save some disk space new File(indexFilename).delete(); } } }
From source file:cmd.tdbloader4.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName()); ToolRunner.printGenericCommandUsage(System.err); return -1; }//from w ww . j a va 2 s . c o m Configuration configuration = getConf(); configuration.set(Constants.RUN_ID, String.valueOf(System.currentTimeMillis())); boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT, Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT); boolean copyToLocal = configuration.getBoolean(Constants.OPTION_COPY_TO_LOCAL, Constants.OPTION_COPY_TO_LOCAL_DEFAULT); boolean verify = configuration.getBoolean(Constants.OPTION_VERIFY, Constants.OPTION_VERIFY_DEFAULT); boolean runLocal = configuration.getBoolean(Constants.OPTION_RUN_LOCAL, Constants.OPTION_RUN_LOCAL_DEFAULT); FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration); if (overrideOutput) { fs.delete(new Path(args[1]), true); fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_1), true); fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_2), true); fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_3), true); fs.delete(new Path(args[1] + OUTPUT_PATH_POSTFIX_4), true); } if ((copyToLocal) || (runLocal)) { File path = new File(args[1]); path.mkdirs(); } Tool first = new FirstDriver(configuration); int status = first.run(new String[] { args[0], args[1] + OUTPUT_PATH_POSTFIX_1 }); if (status != 0) { return status; } createOffsetsFile(fs, args[1] + OUTPUT_PATH_POSTFIX_1, args[1] + OUTPUT_PATH_POSTFIX_1); Path offsets = new Path(args[1] + OUTPUT_PATH_POSTFIX_1, Constants.OFFSETS_FILENAME); DistributedCache.addCacheFile(offsets.toUri(), configuration); Tool second = new SecondDriver(configuration); status = second.run(new String[] { args[0], args[1] + OUTPUT_PATH_POSTFIX_2 }); if (status != 0) { return status; } Tool third = new ThirdDriver(configuration); status = third.run(new String[] { args[1] + OUTPUT_PATH_POSTFIX_2, args[1] + 
OUTPUT_PATH_POSTFIX_3 }); if (status != 0) { return status; } Tool fourth = new FourthDriver(configuration); status = fourth.run(new String[] { args[1] + OUTPUT_PATH_POSTFIX_3, args[1] + OUTPUT_PATH_POSTFIX_4 }); if (status != 0) { return status; } if (copyToLocal) { Tool download = new download(configuration); download.run( new String[] { args[1] + OUTPUT_PATH_POSTFIX_2, args[1] + OUTPUT_PATH_POSTFIX_4, args[1] }); } if (verify) { DatasetGraphTDB dsgMem = load(args[0]); Location location = new Location(args[1]); if (!copyToLocal) { // TODO: this is a sort of a cheat and it could go away (if it turns out to be too slow)! download.fixNodeTable2(location); } DatasetGraphTDB dsgDisk = SetupTDB.buildDataset(location); boolean isomorphic = isomorphic(dsgMem, dsgDisk); System.out.println("> " + isomorphic); } return status; }
From source file:cn.uway.util.apache.parquet.hadoop.ParquetFileWriter.java
License:Apache License
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) { String rootPath = root.toUri().getPath(); GlobalMetaData fileMetaData = null;//from w ww.j a v a2 s. c om List<BlockMetaData> blocks = new ArrayList<BlockMetaData>(); for (Footer footer : footers) { String footerPath = footer.getFile().toUri().getPath(); if (!footerPath.startsWith(rootPath)) { throw new ParquetEncodingException( footerPath + " invalid: all the files must be contained in the root " + root); } footerPath = footerPath.substring(rootPath.length()); while (footerPath.startsWith("/")) { footerPath = footerPath.substring(1); } fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData); for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) { block.setPath(footerPath); blocks.add(block); } } return new ParquetMetadata(fileMetaData.merge(), blocks); }
From source file:co.cask.cdap.data.stream.StreamDataFileSplitter.java
License:Apache License
/**
 * Derives the index file path from an event file path by replacing the trailing
 * event suffix with the index suffix.
 *
 * <p>The last {@code StreamFileType.EVENT.getSuffix().length()} characters of the
 * event file's URI string are dropped without checking that they actually are the
 * event suffix.
 *
 * @param eventFile path of the event file
 * @return path built from the event URI with the index suffix in place of its tail
 */
private Path getIndexFile(Path eventFile) {
    String eventUri = eventFile.toUri().toString();
    int baseLength = eventUri.length() - StreamFileType.EVENT.getSuffix().length();
    String indexUri = eventUri.substring(0, baseLength) + StreamFileType.INDEX.getSuffix();
    return new Path(URI.create(indexUri));
}