Example usage for org.apache.hadoop.fs Path toString

List of usage examples for org.apache.hadoop.fs Path toString

Introduction

On this page you can find example usages of the toString method of org.apache.hadoop.fs.Path.

Prototype

@Override
    public String toString() 

Source Link

Usage

From source file:com.datos.vfs.provider.hdfs.HdfsFileSystemConfigBuilder.java

License:Apache License

/**
 * Registers an additional configuration file to be loaded after the defaults.
 * <p>
 * The path names a local file system config file whose settings override specific HDFS settings.
 * The property will be passed on to {@code org.apache.hadoop.conf.Configuration#addResource(Path)}
 * after the URL was set as the default name with: {@code Configuration#set(FileSystem.FS_DEFAULT_NAME_KEY, url)}.
 * <p>
 * One use for this is to set a different value for the {@code dfs.client.use.datanode.hostname}
 * property in order to access HDFS files stored in an AWS installation (from outside their
 * firewall). There are other possible uses too.
 * <p>
 * This method may be called multiple times; each call appends the new path to the
 * comma-separated list stored under {@code KEY_CONFIG_PATHS}, so resources keep the order
 * in which they were specified.
 *
 * @param opts The FileSystemOptions to modify.
 * @param path full path of additional configuration file (local file system) or {@code null}
 * to unset all the path values set so far.
 */
public void setConfigPath(final FileSystemOptions opts, final Path path) {
    if (path == null) {
        // A null path clears every path registered so far.
        this.setParam(opts, KEY_CONFIG_PATHS, null);
        return;
    }

    final String existing = this.getString(opts, KEY_CONFIG_PATHS);
    final boolean nothingStored = existing == null || existing.isEmpty();
    final String updated = nothingStored ? path.toString() : existing + "," + path.toString();
    this.setParam(opts, KEY_CONFIG_PATHS, updated);
}

From source file:com.david.mos.out.FileOutputFormat.java

License:Apache License

/**
 * Stores the output directory of the map-reduce job in the job configuration.
 * <p>
 * The directory is qualified against its own file system before its string form is
 * written under {@code FileOutputFormat.OUTDIR}.
 *
 * @param job The job to modify
 * @param outputDir the {@link Path} of the output directory for
 * the map-reduce job.
 */
public static void setOutputPath(Job job, Path outputDir) {
    Path qualifiedDir;
    try {
        qualifiedDir = outputDir.getFileSystem(job.getConfiguration()).makeQualified(outputDir);
    } catch (IOException e) {
        // Rethrown unchecked to stay compatible with MR1.
        throw new RuntimeException(e);
    }
    job.getConfiguration().set(FileOutputFormat.OUTDIR, qualifiedDir.toString());
}

From source file:com.davidgildeh.hadoop.utils.FileUtils.java

License:Apache License

/**
 * Merges the files found directly inside an input directory into a single file at the
 * given output path. Sub-directories of the input directory are skipped.
 * <p>
 * The file system obtained for the output path is closed before this method returns,
 * whether the merge succeeds or fails.
 *
 * @param inputPath         The input directory containing all the input files. E.g. /input/dir/on/hdfs/
 * @param outputPath        The output path to output the file. E.g. /output/dir/on/hdfs/filename
 * @throws IOException if the input path does not exist, is not a directory, or any
 *                     read/write operation fails
 */
public static void mergeFiles(String inputPath, String outputPath) throws IOException {

    Path inputDir = new Path(inputPath);
    Path outputFile = new Path(outputPath);
    FileSystem fileSystem = getFileSystem(outputFile);

    // checkFileExists closes the file system itself before throwing, so it stays
    // outside the try/finally below.
    checkFileExists(fileSystem, inputDir);

    try {
        // Check the input path is a directory
        if (!fileSystem.getFileStatus(inputDir).isDir()) {
            LOG.error("Path '" + inputDir.toString() + "' is not a directory.");
            throw new IOException("Path '" + inputDir.toString() + "' is not a directory.");
        }

        // Create Output File
        OutputStream out = fileSystem.create(outputFile);
        try {
            // Loop through all files in directory and merge them into one file
            for (FileStatus entry : fileSystem.listStatus(inputDir)) {
                if (entry.isDir()) {
                    continue;
                }
                InputStream in = fileSystem.open(entry.getPath());
                try {
                    // 'false' keeps the streams open; they are closed explicitly here.
                    IOUtils.copyBytes(in, out, fileSystem.getConf(), false);
                } finally {
                    in.close();
                }
            }
        } finally {
            out.close();
        }

        // Only reached when every file was copied and the output was closed cleanly.
        LOG.info("Merged input files from '" + inputPath + "' to '" + outputPath + "'");
    } finally {
        // Runs even if out.close() or the directory check above throws.
        fileSystem.close();
    }
}

From source file:com.davidgildeh.hadoop.utils.FileUtils.java

License:Apache License

/**
 * Verifies that a path exists on the given file system.
 * <p>
 * When the path is missing, the error is logged, the file system is closed and a
 * {@link FileNotFoundException} is thrown.
 *
 * @param fileSystem        The file system to query (closed on failure)
 * @param path              The path of the file to check
 * @throws IOException if the path does not exist or the existence check itself fails
 */
private static void checkFileExists(FileSystem fileSystem, Path path) throws IOException {
    if (fileSystem.exists(path)) {
        return;
    }

    final String message = "Path '" + path.toString() + "' does not exist.";
    LOG.error(message);
    fileSystem.close();
    throw new FileNotFoundException(message);
}

From source file:com.digitalpebble.behemoth.ClassifierJob.java

License:Apache License

/**
 * Parses the command line, configures a map-only classification job over a Behemoth
 * corpus and runs it.
 *
 * @param args command line arguments; {@code -i} (input), {@code -o} (output) and
 *             {@code -m} (model location) are required, {@code -h} prints the help
 * @return 0 when the job completes or help was requested; -1 when the command line
 *         cannot be parsed, a required option is missing, or the job fails
 * @throws Exception if the job cannot be configured (for example an invalid model URI)
 */
public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");
    options.addOption("m", "model", true, "location of the model");

    // parse the command line arguments
    CommandLine line;
    try {
        line = parser.parse(options, args);
    } catch (ParseException e) {
        // Without a parsed command line there is nothing to run; stop here instead
        // of dereferencing a null CommandLine further down.
        formatter.printHelp("ClassifierJob", options);
        return -1;
    }

    if (line.hasOption("help")) {
        formatter.printHelp("ClassifierJob", options);
        return 0;
    }

    String input = line.getOptionValue("i");
    String output = line.getOptionValue("o");
    String modelPath = line.getOptionValue("m");
    if (modelPath == null || input == null || output == null) {
        formatter.printHelp("ClassifierJob", options);
        return -1;
    }

    Path inputPath = new Path(input);
    Path outputPath = new Path(output);

    JobConf job = new JobConf(getConf());

    // push the model file to the DistributedCache
    DistributedCache.addCacheArchive(new URI(modelPath), job);

    job.setJarByClass(this.getClass());

    job.setJobName("ClassifierJob : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(TextClassifierMapper.class);
    // map-only job
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.set(modelNameParam, modelPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        // Report the failure to the caller instead of returning success.
        return -1;
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.tika.TikaDriver.java

License:Apache License

/**
 * Parses the command line, configures the Tika extraction job over a Behemoth corpus
 * and runs it.
 *
 * @param args command line arguments; {@code input} and {@code output} are built as
 *             required options, {@code tikaProcessor} and {@code mimeType} as optional ones
 * @return 0 when the job completes; -1 when the options are invalid or the job fails
 * @throws Exception if the job cannot be configured
 */
public int run(String[] args) throws Exception {

    GroupBuilder gBuilder = new GroupBuilder().withName("Options:");
    List<Option> options = new ArrayList<Option>();
    Option inputOpt = buildOption("input", "i", "The input path", true, true, null);
    options.add(inputOpt);
    Option outOpt = buildOption("output", "o", "The output path", true, true, null);
    options.add(outOpt);
    Option tikaOpt = buildOption("tikaProcessor", "t",
            "The fully qualified name of a TikaProcessor class that handles the extraction (optional)", true,
            false, null);
    options.add(tikaOpt);
    Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, "");
    options.add(mimeTypeOpt);
    for (Option opt : options) {
        gBuilder = gBuilder.withOption(opt);
    }

    Group group = gBuilder.create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        CommandLine cmdLine = parser.parse(args);
        Path inputPath = new Path(cmdLine.getValue(inputOpt).toString());
        Path outputPath = new Path(cmdLine.getValue(outOpt).toString());
        String handlerName = null;
        if (cmdLine.hasOption(tikaOpt)) {
            handlerName = cmdLine.getValue(tikaOpt).toString();
        }

        JobConf job = new JobConf(getConf());
        job.setJarByClass(this.getClass());

        if (cmdLine.hasOption(mimeTypeOpt)) {
            String mimeType = cmdLine.getValue(mimeTypeOpt).toString();
            job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType);
        }

        if (handlerName != null && !handlerName.isEmpty()) {
            job.set(TIKA_PROCESSOR_KEY, handlerName);
        }

        job.setJobName("Tika : " + inputPath.toString());

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BehemothDocument.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BehemothDocument.class);

        job.setMapperClass(TikaMapper.class);

        // A reducer is only attached when BehemothReducer reports it is required;
        // otherwise the job runs map-only.
        if (BehemothReducer.isRequired(job)) {
            job.setReducerClass(BehemothReducer.class);
        } else {
            job.setNumReduceTasks(0);
        }

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        try {
            long start = System.currentTimeMillis();
            JobClient.runJob(job);
            long finish = System.currentTimeMillis();
            if (log.isInfoEnabled()) {
                log.info("TikaDriver completed. Timing: " + (finish - start) + " ms");
            }
        } catch (Exception e) {
            // The output is deliberately left in place: some of it could still be used.
            log.error("Exception", e);
            return -1;
        }

    } catch (OptionException e) {
        // Pass the exception itself so its message and stack trace reach the log.
        log.error("OptionException", e);
        HelpFormatter formatter = new HelpFormatter();
        formatter.setGroup(group);
        formatter.print();
        return -1;
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.util.CorpusFilter.java

License:Apache License

/**
 * Parses the command line, configures a map-only filtering job over a Behemoth corpus
 * and runs it.
 *
 * @param args command line arguments; {@code -i} (input) and {@code -o} (output) are
 *             required, {@code -h} prints the help
 * @return 0 when the job completes or help was requested; -1 when the command line
 *         cannot be parsed, a required option is missing, no filters are configured,
 *         or the job fails
 * @throws Exception if the job cannot be configured or the output cannot be deleted
 *                   after a failed run
 */
public int run(String[] args) throws Exception {

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line;
    try {
        line = parser.parse(options, args);
    } catch (ParseException e) {
        // Without a parsed command line there is nothing to run; stop here instead
        // of dereferencing a null CommandLine further down.
        formatter.printHelp("CorpusFilter", options);
        return -1;
    }

    if (line.hasOption("help")) {
        formatter.printHelp("CorpusFilter", options);
        return 0;
    }

    String input = line.getOptionValue("i");
    String output = line.getOptionValue("o");
    if (input == null || output == null) {
        formatter.printHelp("CorpusFilter", options);
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(input);
    Path outputPath = new Path(output);

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusFilter : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    // Filtering is the whole point of this job, so a missing filter configuration is an error.
    if (!BehemothMapper.isRequired(job)) {
        System.err.println("No filters configured. Check your behemoth-site.xml");
        return -1;
    }
    job.setMapperClass(BehemothMapper.class);
    // map-only job
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        // Remove the partial output, then report the failure instead of returning success.
        fs.delete(outputPath, true);
        return -1;
    }

    return 0;
}

From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java

License:Apache License

/**
 * Set the array of {@link Path}s as the list of inputs
 * for the map-reduce job.
 * <p>
 * Each path is qualified against its own file system and escaped, and the results are
 * joined with {@code StringUtils.COMMA_STR} and stored under {@code INPUT_DIR},
 * replacing any previous value.
 *
 * @param job The job to modify
 * @param inputPaths the {@link Path}s of the input directories/files
 * for the map-reduce job; at least one is required.
 * @throws IOException if a path's file system cannot be obtained
 * @throws IllegalArgumentException if {@code inputPaths} is {@code null} or empty
 */
public static void setInputPaths(Job job, Path... inputPaths) throws IOException {
    if (inputPaths == null || inputPaths.length == 0) {
        // Fail with a clear message rather than an index-out-of-bounds on inputPaths[0].
        throw new IllegalArgumentException("At least one input path is required");
    }

    Configuration conf = job.getConfiguration();
    StringBuilder dirs = new StringBuilder();
    for (int i = 0; i < inputPaths.length; i++) {
        if (i > 0) {
            dirs.append(StringUtils.COMMA_STR);
        }
        Path qualified = inputPaths[i].getFileSystem(conf).makeQualified(inputPaths[i]);
        dirs.append(StringUtils.escapeString(qualified.toString()));
    }
    conf.set(INPUT_DIR, dirs.toString());
}

From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java

License:Apache License

/**
 * Appends a {@link Path} to the list of inputs for the map-reduce job.
 * <p>
 * The path is qualified against its own file system, escaped, and added to the
 * comma-separated value stored under {@code INPUT_DIR}.
 *
 * @param job The {@link Job} to modify
 * @param path {@link Path} to be added to the list of inputs for
 *            the map-reduce job.
 * @throws IOException if the path's file system cannot be obtained
 */
public static void addInputPath(Job job, Path path) throws IOException {
    final Configuration conf = job.getConfiguration();
    final Path qualified = path.getFileSystem(conf).makeQualified(path);
    final String escaped = StringUtils.escapeString(qualified.toString());

    final String existing = conf.get(INPUT_DIR);
    if (existing == null) {
        // First input path: store it on its own.
        conf.set(INPUT_DIR, escaped);
    } else {
        conf.set(INPUT_DIR, existing + "," + escaped);
    }
}

From source file:com.ebay.erl.mobius.core.builder.AbstractDatasetBuilder.java

License:Apache License

/**
 * Specify the input path(s) of a {@link Dataset}.
 * /*from   w w  w .  ja va 2 s. co m*/
 * @param paths one or more path that contain the dataset of
 * @return the builder itself.
 * @throws IOException
 */
public ACTUAL_BUILDER_IMPL addInputPath(Path... paths) throws IOException {
    for (Path anInput : paths) {
        LOGGER.info("Adding an input path:" + anInput.toString());
        if (this.mobiusJob.isOutputOfAnotherJob(anInput)) {
            this.addInputPath(false, anInput);
        } else {
            this.addInputPath(true, anInput);
        }
    }
    return (ACTUAL_BUILDER_IMPL) this;
}