List of usage examples for org.apache.hadoop.fs.FileStatus.isDirectory()
public boolean isDirectory()
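For orientation before the project excerpts below, here is a minimal standalone sketch (not taken from any of the listed source files) of the typical isDirectory() check while iterating a directory listing; the path "/tmp" and the default Configuration are placeholders.

// Minimal sketch: report whether each child of a path is a directory or a regular file.
// The path "/tmp" and the default Configuration are placeholder assumptions.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        for (FileStatus status : fs.listStatus(new Path("/tmp"))) {
            // isDirectory() distinguishes directories from regular files in a listing
            System.out.println(status.getPath() + (status.isDirectory() ? " [dir]" : " [file]"));
        }
        fs.close();
    }
}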
From source file:org.trustedanalytics.cfbroker.store.hdfs.service.SimpleHdfsClient.java
License:Apache License
@Override
public List<String> listFiles(String path, boolean recursive) throws IOException {
    List<String> files = new ArrayList<>();
    FileStatus[] statuses = fs.listStatus(new Path(path));
    for (FileStatus status : statuses) {
        files.add(status.getPath().toString());
        if (status.isDirectory() && recursive)
            files.addAll(listFiles(status.getPath().toString(), recursive));
    }
    return files;
}
From source file:org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.info("StreamWikiDumpInputFormat.getSplits job=" + job + " n=" + numSplits);
    InputSplit[] oldSplits = super.getSplits(job, numSplits);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);
    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }
    long minSize = job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1);
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    for (FileStatus file : files) {
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(goalSize, minSize, blockSize);
        LOG.info(String.format("goalsize=%d splitsize=%d blocksize=%d", goalSize, splitSize, blockSize));
        for (InputSplit x : getSplits(job, file, pageBeginPattern, splitSize))
            splits.add(x);
    }
    System.err.println("splits=" + splits);
    return splits.toArray(new InputSplit[splits.size()]);
}
From source file:stewi.mapred.LenientSequenceFileInputFormat.java
License:Apache License
@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
    FileStatus[] files = super.listStatus(job);
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        if (file.isDirectory()) { // it's a MapFile
            Path dataFile = new Path(file.getPath(), MapFile.DATA_FILE_NAME);
            FileSystem fs = file.getPath().getFileSystem(job);
            // use the data file
            files[i] = fs.getFileStatus(dataFile);
        }
    }
    return files;
}
From source file:streaming.core.HDFSTarEntry.java
License:Apache License
public HDFSTarEntry(FileStatus hdfsFileStatus, String entryName) {
    super(null, entryName);
    this.hdfsFileStatus = hdfsFileStatus;
    header = TarHeader.createHeader(entryName, hdfsFileStatus.getLen(),
            hdfsFileStatus.getModificationTime() / 1000, hdfsFileStatus.isDirectory());
}
From source file:terasort.io.TeraOutputFormat.java
License:Apache License
@Override
public void checkOutputSpecs(JobContext job) throws IOException {
    // Ensure that the output directory is set
    Path outDir = getOutputPath(job);
    if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }
    final Configuration jobConf = job.getConfiguration();
    // get delegation token for outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, jobConf);
    final FileSystem fs = outDir.getFileSystem(jobConf);
    if (fs.exists(outDir)) {
        // existing output dir is considered empty iff its only content is the
        // partition file.
        final FileStatus[] outDirKids = fs.listStatus(outDir);
        boolean empty = false;
        if (outDirKids != null && outDirKids.length == 1) {
            final FileStatus st = outDirKids[0];
            final String fname = st.getPath().getName();
            empty = !st.isDirectory();
        }
        if (!empty) {
            throw new FileAlreadyExistsException("Output directory " + outDir + " already exists");
        }
    }
}
From source file:wanggang1987.bigdataapi.hadoopapi.HadoopClientAPI.java
/**
 * List all regular files (non-directories) directly under the given path.
 *
 * @param path
 * @return
 * @throws IllegalArgumentException
 */
public ArrayList<String> listFiles(String path) {
    ArrayList<String> list = null;
    try {
        list = new ArrayList<>();
        FileStatus[] files = hdfs.listStatus(new Path(path));
        for (FileStatus file : files) {
            if (!file.isDirectory()) {
                list.add(file.getPath().toString());
            }
        }
    } catch (IllegalArgumentException | IOException e) {
        logger.error("listFiles failed", e);
    }
    return list;
}
From source file:wherehows.SchemaFetch.java
License:Open Source License
/**
 * Decide whether this is a dataset by the format of its sub directories
 *
 * @param path
 * @return 1 : empty dataset or lowest level dataset
 *         0 : may have sub dataset
 *         < 0 : error
 * @throws java.io.IOException
 * @throws AccessControlException
 */
private static int isTable(Path path, FileSystem fs) throws IOException, AccessControlException {
    int hiddenFileCount = 0;
    int datePartitionCount = 0;
    int dataSetCount = 0;
    int fileCount = 0;
    int i = 0;
    String objName;

    try {
        // System.err.println(" Probing " + path.toString());
        for (FileStatus fstat : fs.listStatus(path)) {
            objName = fstat.getPath().getName();
            if (!fstat.isDirectory()) {
                // file
                fileCount++;
            } else if (objName.matches("(_|\\.|tmp|temp|_distcp|backup|\\*|test|trash).*")) {
                // hidden/temporary fs object
                hiddenFileCount++;
            } else if (objName.matches(
                    "daily|hourly|hourly.deduped|monthly|weekly|(ds|dt|datepartition|year|month|date)=[0-9-]+")) {
                // temporal partition type
                datePartitionCount++;
            } else if (objName.matches(
                    "[0-9\\-_]+\\w+[0-9\\-_]+|\\w+_day=[0-9\\-_]+|\\p{Alnum}+=[0-9\\-_]+|[0-9\\-_]+|[0-9]{14}_\\w+|[0-9]{8}_\\w+|[0-9]{4}-[0-9]{2}-[0-9]{2}.*")) {
                // temporal
                datePartitionCount++;
            } else {
                // sub directory
                dataSetCount++;
            }
            i++;
        } // end of for fstat
    } // end of try
    catch (AccessControlException e) {
        return -1; // Usually there is a permission issue
    } catch (IOException e) {
        return -2;
    } catch (Exception e) {
        return -3;
    }

    // System.err.println(" -- isTable(" + path.toString() + ") i=" + i + " datePartition=" + datePartitionCount + " dataSet=" + dataSetCount);
    if (i == 0 || dataSetCount == 0) {
        return 1;
    } else if (i > 0 && datePartitionCount > 0) {
        return 1;
    } else {
        return 0;
    }
}
From source file:wherehows.SchemaFetch.java
License:Open Source License
/**
 * Collect one dataset's metadata
 *
 * @param path
 * @throws java.io.IOException
 */
private static void traceTableInfo(Path path, FileSystem tranceFs) throws IOException, SQLException {
    logger.info("trace table : " + path.toUri().getPath());

    // analyze the pattern of the name
    String tbl_name = path.getName();
    if (tbl_name.matches("(_|\\.|tmp|temp|stg|test|\\*).*")) { // skip _temporary _schema.avsc
        return;
    }

    FileStatus[] fstat_lst;
    FileStatus fstat = tranceFs.getFileStatus(path);
    String fullPath = path.toUri().getPath();
    String xName = "";
    long data_size = -1;
    long sample_data_size = -1;
    int i, x;
    // String data_source = checkDataSource(fullPath);

    // TODO this part needs a rewrite
    try {
        while (fstat.isDirectory()) {
            fstat_lst = tranceFs.listStatus(fstat.getPath()); // list all children
            if (fstat_lst.length == 0) { // empty directory
                logger.info(fstat.getPath().toUri().getPath() + " is empty.");
                return;
            }

            int is_fstat_visible = 0;
            for (i = fstat_lst.length - 1; i >= 0; i--) { // iterate from the last item back to the first
                fstat = fstat_lst[i]; // start from the last file in the list
                xName = fstat.getPath().getName();

                if (xName.matches("\\.pig_schema|.*\\.avsc|\\.dataset")) {
                    is_fstat_visible = 1;
                    break;
                } else if (xName.equals("hourly") && i > 0
                        && fstat_lst[i - 1].getPath().getName().equals("daily")) {
                    continue; // try to traverse "daily" instead of "hourly" when possible
                } else if (xName.matches("(_|\\.|tmp|temp).*")) {
                    continue;
                }

                try { // sub directory may be inaccessible
                    sample_data_size = fstat.isDirectory()
                            ? tranceFs.getContentSummary(fstat.getPath()).getLength()
                            : fstat.getLen();
                } catch (AccessControlException e) {
                    if (tranceFs.listStatus(fstat.getPath()).length > 0) {
                        is_fstat_visible = 1;
                        break;
                    } else {
                        continue;
                    }
                }

                if (!fstat.isDirectory()
                        && !xName.matches("(_|\\.).*|.*\\.(jar|json|txt|csv|tsv|zip|gz|lzo)")) {
                    is_fstat_visible = 1;
                    break;
                }

                // if fstat is a directory
                if (fstat.isDirectory() && !xName.matches("(_|\\.).*")) {
                    is_fstat_visible = 1;
                    break;
                }
            }

            // logger.info(fstat.getPath() + " is_fstat_visible : " + is_fstat_visible);
            if (is_fstat_visible == 0) {
                return;
            }
        }
    } catch (AccessControlException e) {
        logger.error("* TblInfo() Cannot access " + fstat.getPath().toUri().getPath());
        return;
    }

    // get schema and sample data
    DatasetJsonRecord datasetSchemaRecord = fileAnalyzerFactory.getSchema(fstat.getPath(),
            path.toUri().getPath());
    if (datasetSchemaRecord != null) {
        schemaFileWriter.append(datasetSchemaRecord);
    } else {
        logger.error("* Cannot resolve the schema of " + fullPath);
    }

    SampleDataRecord sampleDataRecord = fileAnalyzerFactory.getSampleData(fstat.getPath(),
            path.toUri().getPath());
    if (sampleDataRecord != null) {
        sampleFileWriter.append(sampleDataRecord);
    } else {
        System.err.println("* Cannot fetch sample data of " + fullPath);
    }
}
From source file:wikiduper.clir.rp.TextDocnoMappingBuilder.java
License:Apache License
@Override
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    System.out.println("WTF WTF args: " + Arrays.toString(args));
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + TextDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TextDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = Job.getInstance(getConf());
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TextDocnoMappingBuilder.class);
    job.setNumReduceTasks(1);

    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
    Path collectionPath = new Path(options.collection);
    for (FileStatus status : fs.listStatus(collectionPath, filter)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TextInputFormat.class); // options.inputFormat
    LOG.info("Input format : " + options.inputFormat);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    LOG.info("Here1\n");

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
    fs.delete(new Path(tmpDir), true);

    return 0;
}