List of usage examples for org.apache.hadoop.fs FileSystem isFile
@Deprecated public boolean isFile(Path f) throws IOException
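This method is deprecated in favor of statting the path once and inspecting the returned FileStatus. A minimal sketch of the equivalent check (the helper name isRegularFile is ours; the FileNotFoundException handling is what makes the deprecated method return false for missing paths):

import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Equivalent of the deprecated fs.isFile(path): a single getFileStatus()
// round trip, with FileNotFoundException mapped to false so that a missing
// path simply reports "not a file".
static boolean isRegularFile(FileSystem fs, Path path) throws IOException {
    try {
        return fs.getFileStatus(path).isFile();
    } catch (FileNotFoundException e) {
        return false;
    }
}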
From source file:cascading.avro.AvroScheme.java
License:Apache License
/**
 * This method peeks at the source data to get a schema when none has been provided.
 *
 * @param flowProcess The cascading FlowProcess object for this flow.
 * @param tap         The cascading Tap object.
 * @return Schema The schema of the peeked at data, or Schema.NULL if none exists.
 */
private Schema getSourceSchema(FlowProcess<JobConf> flowProcess, Tap tap) throws IOException {
    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final String path = tap.getIdentifier();
    Path p = new Path(path);
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
    // Get all the input dirs
    List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs.globStatus(p, filter)));
    // Now get all the things that are one level down
    for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
        if (status.isDir())
            for (FileStatus child : Arrays.asList(fs.listStatus(status.getPath(), filter))) {
                if (child.isDir()) {
                    statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
                } else if (fs.isFile(child.getPath())) {
                    statuses.add(child);
                }
            }
    }
    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (fs.isFile(statusPath)) { // no need to open them all
            InputStream stream = null;
            DataFileStream reader = null;
            try {
                stream = new BufferedInputStream(fs.open(statusPath));
                reader = new DataFileStream(stream, new GenericDatumReader());
                return reader.getSchema();
            } finally {
                if (reader == null) {
                    if (stream != null) {
                        stream.close();
                    }
                } else {
                    reader.close();
                }
            }
        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}
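One detail worth noting in the one-level-down expansion above: each FileStatus returned by listStatus() already knows whether it is a file, so fs.isFile(child.getPath()) issues a getFileStatus() RPC for information already in hand. A hedged variant of just that loop, written as a standalone helper (the helper name is ours, not from the cascading.avro source; assumes Hadoop 2.x+, where FileStatus.isFile()/isDirectory() replace the deprecated isDir()):

import java.io.IOException;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Expand a directory one level down without re-statting children: the
// FileStatus objects from listStatus() answer the file/directory question
// locally, avoiding the extra NameNode round trip per child.
static List<FileStatus> expandOneLevel(FileSystem fs, FileStatus status, PathFilter filter)
        throws IOException {
    List<FileStatus> out = new LinkedList<FileStatus>();
    for (FileStatus child : fs.listStatus(status.getPath(), filter)) {
        if (child.isDirectory()) {
            out.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
        } else if (child.isFile()) {
            out.add(child);
        }
    }
    return out;
}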
From source file:cascading.scheme.DeprecatedAvroScheme.java
License:Apache License
/**
 * This method peeks at the source data to get a schema when none has been provided.
 *
 * @param flowProcess The cascading FlowProcess object for this flow.
 * @param tap         The cascading Tap object.
 * @return Schema The schema of the peeked at data, or Schema.NULL if none exists.
 */
private Schema getSourceSchema(FlowProcess<? extends Configuration> flowProcess, Tap tap) throws IOException {
    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final String path = tap.getIdentifier();
    Path p = new Path(path);
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
    // Get all the input dirs
    List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs.globStatus(p, filter)));
    // Now get all the things that are one level down
    for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
        if (status.isDir())
            for (FileStatus child : Arrays.asList(fs.listStatus(status.getPath(), filter))) {
                if (child.isDir()) {
                    statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
                } else if (fs.isFile(child.getPath())) {
                    statuses.add(child);
                }
            }
    }
    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (fs.isFile(statusPath)) { // no need to open them all
            InputStream stream = null;
            DataFileStream reader = null;
            try {
                stream = new BufferedInputStream(fs.open(statusPath));
                reader = new DataFileStream(stream, new GenericDatumReader());
                return reader.getSchema();
            } finally {
                if (reader == null) {
                    if (stream != null) {
                        stream.close();
                    }
                } else {
                    reader.close();
                }
            }
        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}
From source file:cascading.tap.hadoop.Hadoop18TapUtil.java
License:Open Source License
private static void moveTaskOutputs(JobConf conf, FileSystem fs, Path jobOutputDir, Path taskOutput)
        throws IOException {
    String taskId = conf.get("mapred.task.id");

    if (fs.isFile(taskOutput)) {
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));

        if (!fs.rename(taskOutput, finalOutputPath)) {
            if (!fs.delete(finalOutputPath, true)) {
                throw new IOException("Failed to delete earlier output of task: " + taskId);
            }
            if (!fs.rename(taskOutput, finalOutputPath)) {
                throw new IOException("Failed to save output of task: " + taskId);
            }
        }

        LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDir()) {
        FileStatus[] paths = fs.listStatus(taskOutput);
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));

        fs.mkdirs(finalOutputPath);

        if (paths != null) {
            for (FileStatus path : paths) {
                moveTaskOutputs(conf, fs, jobOutputDir, path.getPath());
            }
        }
    }
}
From source file:cascading.tap.hadoop.util.Hadoop18TapUtil.java
License:Open Source License
private static void moveTaskOutputs(Configuration conf, FileSystem fs, Path jobOutputDir, Path taskOutput)
        throws IOException {
    String taskId = conf.get("mapred.task.id", conf.get("mapreduce.task.id"));

    if (fs.isFile(taskOutput)) {
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));

        if (!fs.rename(taskOutput, finalOutputPath)) {
            if (!fs.delete(finalOutputPath, true))
                throw new IOException("Failed to delete earlier output of task: " + taskId);

            if (!fs.rename(taskOutput, finalOutputPath))
                throw new IOException("Failed to save output of task: " + taskId);
        }

        LOG.debug("Moved {} to {}", taskOutput, finalOutputPath);
    } else if (fs.getFileStatus(taskOutput).isDir()) {
        FileStatus[] paths = fs.listStatus(taskOutput);
        Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, getTaskOutputPath(conf));

        fs.mkdirs(finalOutputPath);

        if (paths != null) {
            for (FileStatus path : paths)
                moveTaskOutputs(conf, fs, jobOutputDir, path.getPath());
        }
    }
}
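Both versions of moveTaskOutputs() stat taskOutput up to twice: once through fs.isFile() and once through fs.getFileStatus(taskOutput).isDir(). A hedged sketch of the consolidated pattern (the helper below is illustrative only, not part of the Cascading source):

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// One getFileStatus() call drives both the file and the directory branch,
// halving the NameNode round trips of the isFile()/isDir() pair above.
// Throws FileNotFoundException if the path is absent.
static void describeTaskOutput(FileSystem fs, Path taskOutput) throws IOException {
    FileStatus status = fs.getFileStatus(taskOutput);
    if (status.isFile()) {
        System.out.println(taskOutput + " is a regular file");
    } else if (status.isDirectory()) {
        System.out.println(taskOutput + " is a directory with "
                + fs.listStatus(taskOutput).length + " immediate children");
    }
}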
From source file:cascading.tap.hadoop.ZipInputFormat.java
License:Open Source License
protected Path[] listPathsInternal(JobConf jobConf) throws IOException {
    Path[] dirs = FileInputFormat.getInputPaths(jobConf);

    if (dirs.length == 0)
        throw new IOException("no input paths specified in job");

    for (Path dir : dirs) {
        FileSystem fs = dir.getFileSystem(jobConf);

        if (!fs.isFile(dir))
            throw new IOException("does not support directories: " + dir);
    }

    return dirs;
}
From source file:cascading.tap.hadoop.ZipInputFormat.java
License:Open Source License
/**
 * Splits files returned by {@link #listPathsInternal(JobConf)}. Each file is
 * expected to be in zip format and each split corresponds to a
 * {@link ZipEntry}.
 *
 * @param job       the JobConf data structure, see {@link JobConf}
 * @param numSplits the number of splits required. Ignored here
 * @throws IOException if input files are not in zip format
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    if (LOG.isDebugEnabled())
        LOG.debug("start splitting input ZIP files");

    Path[] files = listPathsInternal(job);

    for (int i = 0; i < files.length; i++) { // check we have valid files
        Path file = files[i];
        FileSystem fs = file.getFileSystem(job);

        if (!fs.isFile(file) || !fs.exists(file))
            throw new IOException("not a file: " + files[i]);
    }

    // generate splits
    ArrayList<ZipSplit> splits = new ArrayList<ZipSplit>(numSplits);

    for (int i = 0; i < files.length; i++) {
        Path file = files[i];
        FileSystem fs = file.getFileSystem(job);

        if (LOG.isDebugEnabled())
            LOG.debug("opening zip file: " + file.toString());

        if (isAllowSplits(fs))
            makeSplits(job, splits, fs, file);
        else
            makeSplit(job, splits, file);
    }

    if (LOG.isDebugEnabled())
        LOG.debug("end splitting input ZIP files");

    return splits.toArray(new ZipSplit[splits.size()]);
}
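A detail worth noting in the validity loop above: in the stock FileSystem implementation, fs.isFile() already returns false for paths that do not exist, so the trailing !fs.exists(file) check can never change the outcome and only adds a round trip. A hedged rewrite of just that loop (the helper name is ours, not from the Cascading source):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// isFile() alone suffices: it is false for directories and for missing
// paths alike, so the separate exists() probe is redundant.
static void checkFilesAreValid(JobConf job, Path[] files) throws IOException {
    for (Path file : files) {
        FileSystem fs = file.getFileSystem(job);
        if (!fs.isFile(file))
            throw new IOException("not a file: " + file);
    }
}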
From source file:cc.slda.DisplayTopic.java
License:Apache License
@SuppressWarnings("unchecked") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(Settings.HELP_OPTION, false, "print the help message"); options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg() .withDescription("input beta file").create(Settings.INPUT_OPTION)); options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg() .withDescription("term index file").create(ParseCorpus.INDEX)); options.addOption(OptionBuilder.withArgName(Settings.INTEGER_INDICATOR).hasArg() .withDescription("display top terms only (default - 10)").create(TOP_DISPLAY_OPTION)); String betaString = null;/*from w w w . j a v a 2s. c o m*/ String indexString = null; int topDisplay = TOP_DISPLAY; CommandLineParser parser = new GnuParser(); HelpFormatter formatter = new HelpFormatter(); try { CommandLine line = parser.parse(options, args); if (line.hasOption(Settings.HELP_OPTION)) { formatter.printHelp(ParseCorpus.class.getName(), options); System.exit(0); } if (line.hasOption(Settings.INPUT_OPTION)) { betaString = line.getOptionValue(Settings.INPUT_OPTION); } else { throw new ParseException("Parsing failed due to " + Settings.INPUT_OPTION + " not initialized..."); } if (line.hasOption(ParseCorpus.INDEX)) { indexString = line.getOptionValue(ParseCorpus.INDEX); } else { throw new ParseException("Parsing failed due to " + ParseCorpus.INDEX + " not initialized..."); } if (line.hasOption(TOP_DISPLAY_OPTION)) { topDisplay = Integer.parseInt(line.getOptionValue(TOP_DISPLAY_OPTION)); } } catch (ParseException pe) { System.err.println(pe.getMessage()); formatter.printHelp(ParseCorpus.class.getName(), options); System.exit(0); } catch (NumberFormatException nfe) { System.err.println(nfe.getMessage()); System.exit(0); } JobConf conf = new JobConf(DisplayTopic.class); FileSystem fs = FileSystem.get(conf); Path indexPath = new Path(indexString); Preconditions.checkArgument(fs.exists(indexPath) && fs.isFile(indexPath), "Invalid index path..."); Path betaPath = new Path(betaString); Preconditions.checkArgument(fs.exists(betaPath) && fs.isFile(betaPath), "Invalid beta path..."); SequenceFile.Reader sequenceFileReader = null; try { IntWritable intWritable = new IntWritable(); Text text = new Text(); Map<Integer, String> termIndex = new HashMap<Integer, String>(); sequenceFileReader = new SequenceFile.Reader(fs, indexPath, conf); while (sequenceFileReader.next(intWritable, text)) { termIndex.put(intWritable.get(), text.toString()); } PairOfIntFloat pairOfIntFloat = new PairOfIntFloat(); // HMapIFW hmap = new HMapIFW(); HMapIDW hmap = new HMapIDW(); TreeMap<Double, Integer> treeMap = new TreeMap<Double, Integer>(); sequenceFileReader = new SequenceFile.Reader(fs, betaPath, conf); while (sequenceFileReader.next(pairOfIntFloat, hmap)) { treeMap.clear(); System.out.println("=============================="); System.out.println( "Top ranked " + topDisplay + " terms for Topic " + pairOfIntFloat.getLeftElement()); System.out.println("=============================="); Iterator<Integer> itr1 = hmap.keySet().iterator(); int temp1 = 0; while (itr1.hasNext()) { temp1 = itr1.next(); treeMap.put(-hmap.get(temp1), temp1); if (treeMap.size() > topDisplay) { treeMap.remove(treeMap.lastKey()); } } Iterator<Double> itr2 = treeMap.keySet().iterator(); double temp2 = 0; while (itr2.hasNext()) { temp2 = itr2.next(); if (termIndex.containsKey(treeMap.get(temp2))) { System.out.println(termIndex.get(treeMap.get(temp2)) + "\t\t" + -temp2); } else { System.out.println("How 
embarrassing! Term index not found..."); } } } } finally { IOUtils.closeStream(sequenceFileReader); } return 0; }
From source file:com.alexholmes.hadooputils.sort.Sort.java
License:Apache License
/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }

        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");

        jobConf.setPartitionerClass(TotalOrderPartitioner.class);

        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}
From source file:com.awcoleman.StandaloneJava.AvroCounterByBlock.java
License:Apache License
public AvroCounterByBlock(String inDirStr) throws IOException {
    long numAvroRecords = 0;

    // Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); // see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    // Check if input dirs/file exists and get file list (even if list of single file)
    Path inPath = new Path(inDirStr);
    if (hdfs.exists(inPath) && hdfs.isFile(inPath)) { // single file
        inputFileList.add(hdfs.getFileStatus(inPath));
    } else if (hdfs.exists(inPath) && hdfs.isDirectory(inPath)) { // dir
        // Get list of input files
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inPath, true);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();

            if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
                inputFileList.add((FileStatus) fileStatus);
            }
        }
    } else {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }

    for (FileStatus thisFileStatus : inputFileList) {

        // _SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }

        DataFileStream<Object> dfs = null;
        FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
        GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
        dfs = new DataFileStream<Object>(inStream, reader);

        long thisFileRecords = 0;

        while (dfs.hasNext()) {
            numAvroRecords = numAvroRecords + dfs.getBlockCount();
            thisFileRecords = thisFileRecords + dfs.getBlockCount();
            //System.out.println("Input file "+thisFileStatus.getPath()+" getBlockCount() is "+dfs.getBlockCount()+"." );
            dfs.nextBlock();
        }
        System.out.println("Input file " + thisFileStatus.getPath() + " has " + thisFileRecords + " records.");
        dfs.close();
        inStream.close();

        //TODO test on dir with non-avro file and see what the exception is, catch that and log to output but don't die.
    }

    System.out.println("Input dir/file ( " + inDirStr + " ) has " + inputFileList.size() + " files and "
            + numAvroRecords + " total records.");
}
From source file:com.awcoleman.StandaloneJava.AvroCounterByRecord.java
License:Apache License
public AvroCounterByRecord(String inDirStr) throws IOException {
    long numAvroRecords = 0;

    // Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); // see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    // Check if input dirs/file exists and get file list (even if list of single file)
    Path inPath = new Path(inDirStr);
    if (hdfs.exists(inPath) && hdfs.isFile(inPath)) { // single file
        inputFileList.add(hdfs.getFileStatus(inPath));
    } else if (hdfs.exists(inPath) && hdfs.isDirectory(inPath)) { // dir
        // Get list of input files
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inPath, true);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();

            if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
                inputFileList.add((FileStatus) fileStatus);
            }
        }
    } else {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }

    for (FileStatus thisFileStatus : inputFileList) {

        // _SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }

        DataFileStream<Object> avroStream = null;
        FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
        GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
        avroStream = new DataFileStream<Object>(inStream, reader);

        long thisFileRecords = 0;

        while (avroStream.hasNext()) {
            numAvroRecords++;
            thisFileRecords++;
            avroStream.next();
        }
        avroStream.close();
        inStream.close();

        System.out.println("Input file " + thisFileStatus.getPath() + " has " + thisFileRecords + " records.");

        //TODO test on dir with non-avro file and see what the exception is, catch that and log to output but don't die.
    }

    System.out.println("Input dir/file ( " + inDirStr + " ) has " + inputFileList.size() + " files and "
            + numAvroRecords + " total records.");
}
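Both AvroCounter variants triage the input path with three separate calls (exists(), isFile(), isDirectory()), each of which stats the path. A hedged sketch of a single-stat alternative (the helper name is ours, not from the original source):

import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// One getFileStatus() call distinguishes file, directory, and missing path,
// with FileNotFoundException standing in for the "not found" branch.
static String classifyInput(FileSystem hdfs, Path inPath) throws IOException {
    try {
        FileStatus status = hdfs.getFileStatus(inPath);
        return status.isFile() ? "file" : "directory";
    } catch (FileNotFoundException e) {
        return "missing";
    }
}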