List of usage examples for org.apache.hadoop.fs.Path#getFileSystem(Configuration)
public FileSystem getFileSystem(Configuration conf) throws IOException
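Before the individual examples, a minimal self-contained sketch of the call itself: Path.getFileSystem(Configuration) resolves the FileSystem implementation that owns the path's scheme (HDFS, local, S3A, ...) from the supplied Configuration. The class name ExampleGetFileSystem and the hdfs:// URI below are hypothetical and used only for illustration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical, minimal illustration of Path#getFileSystem(Configuration).
public class ExampleGetFileSystem {
    public static void main(String[] args) throws IOException {
        // Picks up core-site.xml / hdfs-site.xml if they are on the classpath.
        Configuration conf = new Configuration();
        // Hypothetical path; the scheme (hdfs) determines which FileSystem is returned.
        Path path = new Path("hdfs://namenode:8020/tmp/example.txt");
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataOutputStream out = fs.create(path, true)) {
            out.writeUTF("hello");
        }
        System.out.println("Wrote " + fs.getFileStatus(path).getLen() + " bytes to " + path);
    }
}

Without any Hadoop configuration on the classpath, the same call falls back to the local file system, which is the behaviour several of the examples below rely on.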
From source file:com.david.mos.out.FileOutputFormat.java
License:Apache License
/**
 * Set the {@link Path} of the output directory for the map-reduce job.
 *
 * @param job The job to modify
 * @param outputDir the {@link Path} of the output directory for
 * the map-reduce job.
 */
public static void setOutputPath(Job job, Path outputDir) {
    try {
        outputDir = outputDir.getFileSystem(job.getConfiguration()).makeQualified(outputDir);
    } catch (IOException e) {
        // Throw the IOException as a RuntimeException to be compatible with MR1
        throw new RuntimeException(e);
    }
    job.getConfiguration().set(FileOutputFormat.OUTDIR, outputDir.toString());
}
From source file:com.davidgildeh.hadoop.utils.FileUtils.java
License:Apache License
/**
 * Opens the HDFS FileSystem so file operations can be run. Configuration is
 * loaded and will automatically load Hadoop environment settings.
 *
 * @return The HDFS FileSystem, null if failure
 * @throws IOException
 */
private static FileSystem getFileSystem(Path filePath) throws IOException {
    // Check if we have local Configuration for HDFS set; if not it will default to the local file system
    Configuration conf = new Configuration();
    if (System.getenv("HADOOP_HOME") != null) {
        LOG.info("Loading Hadoop Configuration Files under " + System.getenv("HADOOP_HOME"));
        Path coreSitePath = new Path(System.getenv("HADOOP_HOME"), "conf/core-site.xml");
        conf.addResource(coreSitePath);
        Path hdfsSitePath = new Path(System.getenv("HADOOP_HOME"), "conf/hdfs-site.xml");
        conf.addResource(hdfsSitePath);
    } else {
        LOG.info("HADOOP_HOME Not Set. Using Local File System.");
    }
    return filePath.getFileSystem(conf);
}
From source file:com.digitalpebble.behemoth.gate.GATECorpusGenerator.java
License:Apache License
private void generateXMLdocs(String inputf, String outputf) throws IOException {
    Path input = new Path(inputf);
    File output = new File(outputf);
    if (output.exists() && output.isFile()) {
        System.err.println("Output " + outputf + " already exists");
        return;
    }
    if (output.exists() == false)
        output.mkdirs();
    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateXMLdocs(suPath, output, count);
    }
}
From source file:com.digitalpebble.behemoth.io.nutch.NutchSegmentConverterJob.java
License:Apache License
public int run(String[] args) throws Exception {
    String usage = "Usage: SegmentConverter [-dir segdir | segment] output";
    if (args.length < 2) {
        System.err.println(usage);
        System.exit(-1);
    }
    final List<Path> segments = new ArrayList<Path>();
    if (args[0].equals("-dir")) {
        Path dir = new Path(args[1]);
        FileSystem fs = dir.getFileSystem(getConf());
        FileStatus[] fstats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
        Path[] files = HadoopFSUtil.getPaths(fstats);
        for (Path p : files) {
            segments.add(p);
        }
    } else {
        segments.add(new Path(args[0]));
    }
    Path output = new Path(args[args.length - 1]);
    convert(segments, output);
    return 0;
}
From source file:com.digitalpebble.behemoth.util.ContentExtractor.java
License:Apache License
private int generateDocs(String inputf, String outputf) throws IOException, ArchiveException {
    Path input = new Path(inputf);
    Path dirPath = new Path(outputf);
    FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf());
    if (fsout.exists(dirPath) == false) {
        fsout.mkdirs(dirPath);
    } else {
        System.err.println("Output " + outputf + " already exists");
        return -1;
    }
    // index file
    Path indexPath = new Path(dirPath, "index");
    if (fsout.exists(indexPath) == false) {
        fsout.createNewFile(indexPath);
    }
    maxNumEntriesInArchive = getConf().getInt(numEntriesPerArchiveParamName, 10000);
    index = fsout.create(indexPath);
    createArchive(dirPath);
    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int count[] = { 0 };
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateDocs(suPath, dirPath, count);
    }
    if (index != null)
        index.close();
    if (currentArchive != null) {
        currentArchive.finish();
        currentArchive.close();
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.util.CorpusGenerator.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input file or directory"); options.addOption("o", "output", true, "output Behemoth corpus"); options.addOption("r", "recurse", true, "processes directories recursively (default true)"); options.addOption("u", "unpack", true, "unpack content of archives (default true)"); options.addOption("md", "metadata", true, "add document metadata separated by semicolon e.g. -md source=internet;label=public"); // parse the command line arguments CommandLine line = null;/*from ww w . j a v a 2s. c om*/ try { line = parser.parse(options, args); if (line.hasOption("help")) { formatter.printHelp("CorpusGenerator", options); return 0; } if (!line.hasOption("i")) { formatter.printHelp("CorpusGenerator", options); return -1; } if (!line.hasOption("o")) { formatter.printHelp("CorpusGenerator", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusGenerator", options); } boolean recurse = true; if (line.hasOption("r") && "false".equalsIgnoreCase(line.getOptionValue("r"))) recurse = false; boolean unpack = true; if (line.hasOption("u") && "false".equalsIgnoreCase(line.getOptionValue("u"))) unpack = false; getConf().setBoolean(unpackParamName, unpack); Path inputDir = new Path(line.getOptionValue("i")); Path output = new Path(line.getOptionValue("o")); if (line.hasOption("md")) { String md = line.getOptionValue("md"); getConf().set("md", md); } setInput(inputDir); setOutput(output); long start = System.currentTimeMillis(); if (inputDir.getFileSystem(getConf()).exists(inputDir) == false) { log.error("Input does not exist : " + inputDir); return -1; } long count = generate(recurse); long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("CorpusGenerator completed. Timing: " + (finish - start) + " ms"); } log.info(count + " docs converted"); return 0; }
From source file:com.digitalpebble.behemoth.util.CorpusGenerator.java
License:Apache License
private static long processFiles(Configuration conf, Path input, boolean recurse, PerformanceFileFilter pff)
        throws IOException {
    FileSystem fs = input.getFileSystem(conf);
    FileStatus[] statuses = fs.listStatus(input, pff);
    for (int i = 0; i < statuses.length; i++) {
        FileStatus status = statuses[i];
        if (recurse == true) {
            processFiles(conf, status.getPath(), recurse, pff);
        }
    }
    return pff.counter;
}
From source file:com.digitalpebble.behemoth.util.CorpusReader.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("c", "displayContent", false, "display binary content in output"); options.addOption("t", "displayText", false, "display text in output"); options.addOption("a", "displayAnnotations", false, "display annotations in output"); options.addOption("m", "displayMetadata", false, "display metadata in output"); // parse the command line arguments CommandLine line = null;//from w w w. j a v a2s.c o m try { line = parser.parse(options, args); String input = line.getOptionValue("i"); if (line.hasOption("help")) { formatter.printHelp("CorpusReader", options); return 0; } if (input == null) { formatter.printHelp("CorpusReader", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusReader", options); return -1; } boolean showBinaryContent = line.hasOption("displayContent"); boolean showText = line.hasOption("displayText"); boolean showAnnotations = line.hasOption("displayAnnotations"); boolean showMD = line.hasOption("displayMetadata"); Path inputPath = new Path(line.getOptionValue("i")); Configuration conf = getConf(); FileSystem fs = inputPath.getFileSystem(conf); // filter input DocumentFilter filters = DocumentFilter.getFilters(conf); boolean doFilter = DocumentFilter.isRequired(conf); FileStatus[] fss = fs.listStatus(inputPath); for (FileStatus status : fss) { Path path = status.getPath(); // skips the _log or _SUCCESS files if (!path.getName().startsWith("part-") && !path.getName().equals(inputPath.getName())) continue; SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); Text key = new Text(); BehemothDocument value = new BehemothDocument(); while (reader.next(key, value)) { // skip this document? if (doFilter && filters.keep(value) == false) continue; System.out.println(value.toString(showBinaryContent, showAnnotations, showText, showMD)); } reader.close(); } return 0; }
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * List input directories.
 * Subclasses may override to, e.g., select only files matching a regular
 * expression.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());
    List<IOException> errors = new ArrayList<IOException>();
    // creates a MultiPathFilter with the hiddenFileFilter and the
    // user provided one (if any).
    List<PathFilter> filters = new ArrayList<PathFilter>();
    filters.add(hiddenFileFilter);
    PathFilter jobFilter = getInputPathFilter(job);
    if (jobFilter != null) {
        filters.add(jobFilter);
    }
    PathFilter inputFilter = new MultiPathFilter(filters);
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    for (FileStatus stat : fs.listStatus(globStat.getPath(), inputFilter)) {
                        result.add(stat);
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }
    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    LOG.info("Total input paths to process : " + result.size());
    return result;
}
From source file:com.dinglicom.clouder.mapreduce.input.FileInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else {
                // not splitable
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
            }
        } else {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}