List of usage examples for org.apache.hadoop.fs Path toString
@Override
public String toString()
From source file:com.datos.vfs.provider.hdfs.HdfsFileSystemConfigBuilder.java
License:Apache License
/** * Sets the full path of configuration file to be loaded after the defaults. * <p>//from ww w .ja v a 2s. c om * Specifies the path of a local file system config file to override any specific HDFS settings. * The property will be passed on to {@code org.apache.hadoop.conf.Configuration#addResource(Path)} * after the URL was set as the default name with: {@code Configuration#set(FileSystem.FS_DEFAULT_NAME_KEY, url)}. * <p> * One use for this is to set a different value for the {@code dfs.client.use.datanode.hostname} * property in order to access HDFS files stored in an AWS installation (from outside their * firewall). There are other possible uses too. * <p> * This method may be called multiple times and all the specified resources will be loaded * in the order they were specified. * * @param opts The FileSystemOptions to modify. * @param path full path of additional configuration file (local file system) or {@code null} * to unset all the path values set so far. */ public void setConfigPath(final FileSystemOptions opts, final Path path) { if (path == null) { this.setParam(opts, KEY_CONFIG_PATHS, null); } else { String previousPathNames = this.getString(opts, KEY_CONFIG_PATHS); if (previousPathNames == null || previousPathNames.isEmpty()) { this.setParam(opts, KEY_CONFIG_PATHS, path.toString()); } else { this.setParam(opts, KEY_CONFIG_PATHS, previousPathNames + "," + path.toString()); } } }
From source file:com.david.mos.out.FileOutputFormat.java
License:Apache License
/** * Set the {@link Path} of the output directory for the map-reduce job. * * @param job The job to modify//w w w.j a v a 2 s .co m * @param outputDir the {@link Path} of the output directory for * the map-reduce job. */ public static void setOutputPath(Job job, Path outputDir) { try { outputDir = outputDir.getFileSystem(job.getConfiguration()).makeQualified(outputDir); } catch (IOException e) { // Throw the IOException as a RuntimeException to be compatible with MR1 throw new RuntimeException(e); } job.getConfiguration().set(FileOutputFormat.OUTDIR, outputDir.toString()); }
From source file:com.davidgildeh.hadoop.utils.FileUtils.java
License:Apache License
/** * Merges a list of input files in a directory to a single file under the * outputpath with a specified filename// ww w . j a v a 2 s .c o m * * @param inputPath The input directory containing all the input files. E.g. /input/dir/on/hdfs/ * @param outputPath The output path to output the file. E.g. /output/dir/on/hdfs/filename * @throws IOException */ public static void mergeFiles(String inputPath, String outputPath) throws IOException { Path inputDir = new Path(inputPath); Path outputFile = new Path(outputPath); FileSystem fileSystem = getFileSystem(outputFile); checkFileExists(fileSystem, inputDir); // Check the input path is a directory if (!fileSystem.getFileStatus(inputDir).isDir()) { LOG.error("Path '" + inputDir.toString() + "' is not a directory."); throw new IOException("Path '" + inputDir.toString() + "' is not a directory."); } // Create Output File OutputStream out = fileSystem.create(outputFile); try { FileStatus contents[] = fileSystem.listStatus(inputDir); // Loop through all files in directory and merge them into one file for (int i = 0; i < contents.length; i++) { if (!contents[i].isDir()) { InputStream in = fileSystem.open(contents[i].getPath()); try { IOUtils.copyBytes(in, out, fileSystem.getConf(), false); } finally { in.close(); } } } } finally { out.close(); fileSystem.close(); LOG.info("Merged input files from '" + inputPath + "' to '" + outputPath + "'"); } }
From source file:com.davidgildeh.hadoop.utils.FileUtils.java
License:Apache License
/** * Check if a file exists, if not will throw a FileNotFoundException * //w w w.j av a2s . co m * @param path The path of the file to check * @throws IOException */ private static void checkFileExists(FileSystem fileSystem, Path path) throws IOException { // Check file exists if (!fileSystem.exists(path)) { LOG.error("Path '" + path.toString() + "' does not exist."); fileSystem.close(); throw new FileNotFoundException("Path '" + path.toString() + "' does not exist."); } }
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("o", "output", true, "output Behemoth corpus"); options.addOption("m", "model", true, "location of the model"); // parse the command line arguments CommandLine line = null;//from w w w. j a v a2 s .com try { line = parser.parse(options, args); String input = line.getOptionValue("i"); String output = line.getOptionValue("o"); String model = line.getOptionValue("m"); if (line.hasOption("help")) { formatter.printHelp("ClassifierJob", options); return 0; } if (model == null | input == null | output == null) { formatter.printHelp("ClassifierJob", options); return -1; } } catch (ParseException e) { formatter.printHelp("ClassifierJob", options); } final FileSystem fs = FileSystem.get(getConf()); Path inputPath = new Path(line.getOptionValue("i")); Path outputPath = new Path(line.getOptionValue("o")); String modelPath = line.getOptionValue("m"); JobConf job = new JobConf(getConf()); // push the model file to the DistributedCache DistributedCache.addCacheArchive(new URI(modelPath), job); job.setJarByClass(this.getClass()); job.setJobName("ClassifierJob : " + inputPath.toString()); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(TextClassifierMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.set(modelNameParam, modelPath); try { JobClient.runJob(job); } catch (Exception e) { e.printStackTrace(); } finally { } return 0; }
From source file:com.digitalpebble.behemoth.tika.TikaDriver.java
License:Apache License
public int run(String[] args) throws Exception { final FileSystem fs = FileSystem.get(getConf()); GroupBuilder gBuilder = new GroupBuilder().withName("Options:"); List<Option> options = new ArrayList<Option>(); Option inputOpt = buildOption("input", "i", "The input path", true, true, null); options.add(inputOpt);/*from ww w . j a v a 2 s . c o m*/ Option outOpt = buildOption("output", "o", "The output path", true, true, null); options.add(outOpt); Option tikaOpt = buildOption("tikaProcessor", "t", "The fully qualified name of a TikaProcessor class that handles the extraction (optional)", true, false, null); options.add(tikaOpt); Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, ""); options.add(mimeTypeOpt); for (Option opt : options) { gBuilder = gBuilder.withOption(opt); } Group group = gBuilder.create(); try { Parser parser = new Parser(); parser.setGroup(group); // TODO catch exceptions with parsing of opts CommandLine cmdLine = parser.parse(args); Path inputPath = new Path(cmdLine.getValue(inputOpt).toString()); Path outputPath = new Path(cmdLine.getValue(outOpt).toString()); String handlerName = null; if (cmdLine.hasOption(tikaOpt)) { handlerName = cmdLine.getValue(tikaOpt).toString(); } JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); if (cmdLine.hasOption(mimeTypeOpt)) { String mimeType = cmdLine.getValue(mimeTypeOpt).toString(); job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType); } if (handlerName != null && handlerName.equals("") == false) { job.set(TIKA_PROCESSOR_KEY, handlerName); } job.setJobName("Tika : " + inputPath.toString()); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(TikaMapper.class); boolean isFilterRequired = BehemothReducer.isRequired(job); if (isFilterRequired) job.setReducerClass(BehemothReducer.class); else { job.setNumReduceTasks(0); } FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("TikaDriver completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { log.error("Exception", e); return -1; // don't delete the output as some of it could be used // fs.delete(outputPath, true); } finally { } } catch (OptionException e) { log.error("OptionException", e.getMessage()); HelpFormatter formatter = new HelpFormatter(); formatter.setGroup(group); formatter.print(); return -1; } return 0; }
From source file:com.digitalpebble.behemoth.util.CorpusFilter.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("o", "output", true, "output Behemoth corpus"); // parse the command line arguments CommandLine line = null;/*w w w . j a v a2 s . c o m*/ try { line = parser.parse(options, args); String input = line.getOptionValue("i"); String output = line.getOptionValue("o"); if (line.hasOption("help")) { formatter.printHelp("CorpusFilter", options); return 0; } if (input == null | output == null) { formatter.printHelp("CorpusFilter", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusFilter", options); } final FileSystem fs = FileSystem.get(getConf()); Path inputPath = new Path(line.getOptionValue("i")); Path outputPath = new Path(line.getOptionValue("o")); JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); job.setJobName("CorpusFilter : " + inputPath.toString()); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); boolean isFilterRequired = BehemothMapper.isRequired(job); // should be the case here if (!isFilterRequired) { System.err.println("No filters configured. Check your behemoth-site.xml"); return -1; } job.setMapperClass(BehemothMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); try { JobClient.runJob(job); } catch (Exception e) { e.printStackTrace(); fs.delete(outputPath, true); } finally { } return 0; }
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/** * Set the array of {@link Path}s as the list of inputs * for the map-reduce job.// w w w . j a v a2s. co m * * @param job The job to modify * @param inputPaths the {@link Path}s of the input directories/files * for the map-reduce job. */ public static void setInputPaths(Job job, Path... inputPaths) throws IOException { Configuration conf = job.getConfiguration(); Path path = inputPaths[0].getFileSystem(conf).makeQualified(inputPaths[0]); StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString())); for (int i = 1; i < inputPaths.length; i++) { str.append(StringUtils.COMMA_STR); path = inputPaths[i].getFileSystem(conf).makeQualified(inputPaths[i]); str.append(StringUtils.escapeString(path.toString())); } conf.set(INPUT_DIR, str.toString()); }
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/** * Add a {@link Path} to the list of inputs for the map-reduce job. * //from w w w . j av a 2s . c o m * @param job The {@link Job} to modify * @param path {@link Path} to be added to the list of inputs for * the map-reduce job. */ public static void addInputPath(Job job, Path path) throws IOException { Configuration conf = job.getConfiguration(); path = path.getFileSystem(conf).makeQualified(path); String dirStr = StringUtils.escapeString(path.toString()); String dirs = conf.get(INPUT_DIR); conf.set(INPUT_DIR, dirs == null ? dirStr : dirs + "," + dirStr); }
From source file:com.ebay.erl.mobius.core.builder.AbstractDatasetBuilder.java
License:Apache License
/** * Specify the input path(s) of a {@link Dataset}. * /*from w w w . ja va 2 s. co m*/ * @param paths one or more path that contain the dataset of * @return the builder itself. * @throws IOException */ public ACTUAL_BUILDER_IMPL addInputPath(Path... paths) throws IOException { for (Path anInput : paths) { LOGGER.info("Adding an input path:" + anInput.toString()); if (this.mobiusJob.isOutputOfAnotherJob(anInput)) { this.addInputPath(false, anInput); } else { this.addInputPath(true, anInput); } } return (ACTUAL_BUILDER_IMPL) this; }