List of usage examples for org.apache.hadoop.fs.Path#getFileSystem(Configuration)
public FileSystem getFileSystem(Configuration conf) throws IOException
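Before the individual examples, a minimal self-contained sketch of the call itself: Path.getFileSystem(Configuration) resolves the FileSystem implementation that owns the path's scheme (HDFS, local, S3A, ...) from the supplied Configuration. The class name ExampleGetFileSystem and the hdfs:// URI below are hypothetical and used only for illustration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical, minimal illustration of Path#getFileSystem(Configuration).
public class ExampleGetFileSystem {
    public static void main(String[] args) throws IOException {
        // Picks up core-site.xml / hdfs-site.xml if they are on the classpath.
        Configuration conf = new Configuration();
        // Hypothetical path; the scheme (hdfs) determines which FileSystem is returned.
        Path path = new Path("hdfs://namenode:8020/tmp/example.txt");
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataOutputStream out = fs.create(path, true)) {
            out.writeUTF("hello");
        }
        System.out.println("Wrote " + fs.getFileStatus(path).getLen() + " bytes to " + path);
    }
}

Without any Hadoop configuration on the classpath, the same call falls back to the local file system, which is the behaviour several of the examples below rely on.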
From source file:com.david.mos.out.FileOutputFormat.java
License:Apache License
/**
 * Set the {@link Path} of the output directory for the map-reduce job.
 *
 * @param job The job to modify
 * @param outputDir the {@link Path} of the output directory for
 * the map-reduce job.
 */
public static void setOutputPath(Job job, Path outputDir) {
    try {
        outputDir = outputDir.getFileSystem(job.getConfiguration()).makeQualified(outputDir);
    } catch (IOException e) {
        // Throw the IOException as a RuntimeException to be compatible with MR1
        throw new RuntimeException(e);
    }
    job.getConfiguration().set(FileOutputFormat.OUTDIR, outputDir.toString());
}
From source file:com.davidgildeh.hadoop.utils.FileUtils.java
License:Apache License
/**
 * Opens the HDFS FileSystem so file operations can be run. Configuration is
 * loaded and will automatically load Hadoop environment settings.
 *
 * @return The HDFS FileSystem, null if failure
 * @throws IOException
 */
private static FileSystem getFileSystem(Path filePath) throws IOException {
    // Check if we have local Configuration for HDFS set; if not it will default to the local file system
    Configuration conf = new Configuration();
    if (System.getenv("HADOOP_HOME") != null) {
        LOG.info("Loading Hadoop Configuration Files under " + System.getenv("HADOOP_HOME"));
        Path coreSitePath = new Path(System.getenv("HADOOP_HOME"), "conf/core-site.xml");
        conf.addResource(coreSitePath);
        Path hdfsSitePath = new Path(System.getenv("HADOOP_HOME"), "conf/hdfs-site.xml");
        conf.addResource(hdfsSitePath);
    } else {
        LOG.info("HADOOP_HOME Not Set. Using Local File System.");
    }
    return filePath.getFileSystem(conf);
}
From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java
License:Apache License
private void generateXMLdocs(String inputf, String outputf) throws IOException {
    Path input = new Path(inputf);
    File output = new File(outputf);
    if (output.exists() && output.isFile()) {
        System.err.println("Output " + outputf + " already exists");
        return;
    }
    if (output.exists() == false)
        output.mkdirs();
    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateXMLdocs(suPath, output, count);
    }
}
From source file:com.digitalpebble.behemoth.io.nutch.NutchSegmentConverterJob.java
License:Apache License
public int run(String[] args) throws Exception {
    String usage = "Usage: SegmentConverter [-dir segdir | segment] output";
    if (args.length < 2) {
        System.err.println(usage);
        System.exit(-1);
    }
    final List<Path> segments = new ArrayList<Path>();
    if (args[0].equals("-dir")) {
        Path dir = new Path(args[1]);
        FileSystem fs = dir.getFileSystem(getConf());
        FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
        Path[] files = HadoopFSUtil.getPaths(fstats);
        for (Path p : files) {
            segments.add(p);
        }
    } else {
        segments.add(new Path(args[0]));
    }
    Path output = new Path(args[args.length - 1]);
    convert(segments, output);
    return 0;
}
From source file:com.digitalpebble.behemoth.util.ContentExtractor.java
License:Apache License
private int generateDocs(String inputf, String outputf) throws IOException, ArchiveException {
    Path input = new Path(inputf);
    Path dirPath = new Path(outputf);
    FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf());
    if (fsout.exists(dirPath) == false) {
        fsout.mkdirs(dirPath);
    } else {
        System.err.println("Output " + outputf + " already exists");
        return -1;
    }
    // index file
    Path indexPath = new Path(dirPath, "index");
    if (fsout.exists(indexPath) == false) {
        fsout.createNewFile(indexPath);
    }
    maxNumEntriesInArchive = getConf().getInt(numEntriesPerArchiveParamName, 10000);
    index = fsout.create(indexPath);
    createArchive(dirPath);
    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateDocs(suPath, dirPath, count);
    }
    if (index != null)
        index.close();
    if (currentArchive != null) {
        currentArchive.finish();
        currentArchive.close();
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.util.CorpusGenerator.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input file or directory"); options.addOption("o", "output", true, "output Behemoth corpus"); options.addOption("r", "recurse", true, "processes directories recursively (default true)"); options.addOption("u", "unpack", true, "unpack content of archives (default true)"); options.addOption("md", "metadata", true, "add document metadata separated by semicolon e.g. -md source=internet;label=public"); // parse the command line arguments CommandLine line = null;/*from ww w . j a v a 2s. c om*/ try { line = parser.parse(options, args); if (line.hasOption("help")) { formatter.printHelp("CorpusGenerator", options); return 0; } if (!line.hasOption("i")) { formatter.printHelp("CorpusGenerator", options); return -1; } if (!line.hasOption("o")) { formatter.printHelp("CorpusGenerator", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusGenerator", options); } boolean recurse = true; if (line.hasOption("r") && "false".equalsIgnoreCase(line.getOptionValue("r"))) recurse = false; boolean unpack = true; if (line.hasOption("u") && "false".equalsIgnoreCase(line.getOptionValue("u"))) unpack = false; getConf().setBoolean(unpackParamName, unpack); Path inputDir = new Path(line.getOptionValue("i")); Path output = new Path(line.getOptionValue("o")); if (line.hasOption("md")) { String md = line.getOptionValue("md"); getConf().set("md", md); } setInput(inputDir); setOutput(output); long start = System.currentTimeMillis(); if (inputDir.getFileSystem(getConf()).exists(inputDir) == false) { log.error("Input does not exist : " + inputDir); return -1; } long count = generate(recurse); long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("CorpusGenerator completed. Timing: " + (finish - start) + " ms"); } log.info(count + " docs converted"); return 0; }
From source file:com.digitalpebble.behemoth.util.CorpusGenerator.java
License:Apache License
private static long processFiles(Configuration conf, Path input, boolean recurse, PerformanceFileFilter pff)
        throws IOException {
    FileSystem fs = input.getFileSystem(conf);
    FileStatus[] statuses = fs.listStatus(input, pff);
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        if (recurse == true) {
            processFiles(conf, status.getPath(), recurse, pff);
        }
    }
    return pff.counter;
}
From source file:com.digitalpebble.behemoth.util.CorpusReader.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("c", "displayContent", false, "display binary content in output"); options.addOption("t", "displayText", false, "display text in output"); options.addOption("a", "displayAnnotations", false, "display annotations in output"); options.addOption("m", "displayMetadata", false, "display metadata in output"); // parse the command line arguments CommandLine line = null;//from w w w. j a v a2s.c o m try { line = parser.parse(options, args); String input = line.getOptionValue("i"); if (line.hasOption("help")) { formatter.printHelp("CorpusReader", options); return 0; } if (input == null) { formatter.printHelp("CorpusReader", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusReader", options); return -1; } boolean showBinaryContent = line.hasOption("displayContent"); boolean showText = line.hasOption("displayText"); boolean showAnnotations = line.hasOption("displayAnnotations"); boolean showMD = line.hasOption("displayMetadata"); Path inputPath = new Path(line.getOptionValue("i")); Configuration conf = getConf(); FileSystem fs = inputPath.getFileSystem(conf); // filter input DocumentFilter filters = DocumentFilter.getFilters(conf); boolean doFilter = DocumentFilter.isRequired(conf); FileStatus[] fss = fs.listStatus(inputPath); for (FileStatus status : fss) { Path path = status.getPath(); // skips the _log or _SUCCESS files if (!path.getName().startsWith("part-") && !path.getName().equals(inputPath.getName())) continue; SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); Text key = new Text(); BehemothDocument value = new BehemothDocument(); while (reader.next(key, value)) { // skip this document? if (doFilter && filters.keep(value) == false) continue; System.out.println(value.toString(showBinaryContent, showAnnotations, showText, showMD)); } reader.close(); } return 0; }
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());
    List<IOException> errors = new ArrayList<IOException>();
    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }
    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else {
                // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
            }
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}