List of usage examples for org.apache.hadoop.fs.FileStatus.isDirectory()
public boolean isDirectory()
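For orientation before the project excerpts below, here is a minimal standalone sketch (not taken from any of the listed source files) of the typical isDirectory() check while iterating a directory listing; the path "/tmp" and the default Configuration are placeholders.

// Minimal sketch: report whether each child of a path is a directory or a regular file.
// The path "/tmp" and the default Configuration are placeholder assumptions.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        for (FileStatus status : fs.listStatus(new Path("/tmp"))) {
            // isDirectory() distinguishes directories from regular files in a listing
            System.out.println(status.getPath() + (status.isDirectory() ? " [dir]" : " [file]"));
        }
        fs.close();
    }
}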
From source file:org.trustedanalytics.cfbroker.store.hdfs.service.SimpleHdfsClient.java
License:Apache License
@Override
public List<String> listFiles(String path, boolean recursive) throws IOException {
    List<String> files = new ArrayList<>();
    FileStatus[] statuses = fs.listStatus(new Path(path));
    for (FileStatus status : statuses) {
        files.add(status.getPath().toString());
        if (status.isDirectory() && recursive)
            files.addAll(listFiles(status.getPath().toString(), recursive));
    }
    return files;
}
From source file:org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.info("StreamWikiDumpInputFormat.getSplits job=" + job + " n=" + numSplits);
    InputSplit[] oldSplits = super.getSplits(job, numSplits);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);
    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }
    long minSize = job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1);
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    for (FileStatus file : files) {
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(goalSize, minSize, blockSize);
        LOG.info(String.format("goalsize=%d splitsize=%d blocksize=%d", goalSize, splitSize, blockSize));
        for (InputSplit x : getSplits(job, file, pageBeginPattern, splitSize))
            splits.add(x);
    }
    System.err.println("splits=" + splits);
    return splits.toArray(new InputSplit[splits.size()]);
}
From source file:stewi.mapred.LenientSequenceFileInputFormat.java
License:Apache License
@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
    FileStatus[] files = super.listStatus(job);
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        if (file.isDirectory()) { // it's a MapFile
            Path dataFile = new Path(file.getPath(), MapFile.DATA_FILE_NAME);
            FileSystem fs = file.getPath().getFileSystem(job);
            // use the data file
            files[i] = fs.getFileStatus(dataFile);
        }
    }
    return files;
}
From source file:streaming.core.HDFSTarEntry.java
License:Apache License
public HDFSTarEntry(FileStatus hdfsFileStatus, String entryName) {
    super(null, entryName);
    this.hdfsFileStatus = hdfsFileStatus;
    header = TarHeader.createHeader(entryName, hdfsFileStatus.getLen(),
            hdfsFileStatus.getModificationTime() / 1000, hdfsFileStatus.isDirectory());
}
From source file:terasort.io.TeraOutputFormat.java
License:Apache License
@Override
public void checkOutputSpecs(JobContext job) throws IOException {
    // Ensure that the output directory is set
    Path outDir = getOutputPath(job);
    if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }
    final Configuration jobConf = job.getConfiguration();
    // get delegation token for outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, jobConf);
    final FileSystem fs = outDir.getFileSystem(jobConf);
    if (fs.exists(outDir)) {
        // existing output dir is considered empty iff its only content is the
        // partition file.
        final FileStatus[] outDirKids = fs.listStatus(outDir);
        boolean empty = false;
        if (outDirKids != null && outDirKids.length == 1) {
            final FileStatus st = outDirKids[0];
            final String fname = st.getPath().getName();
            empty = !st.isDirectory();
        }
        if (!empty) {
            throw new FileAlreadyExistsException("Output directory " + outDir + " already exists");
        }
    }
}
From source file:wanggang1987.bigdataapi.hadoopapi.HadoopClientAPI.java
/**
 * List all regular files (non-directories) directly under the given path.
 *
 * @param path
 * @return
 * @throws IllegalArgumentException
 */
public ArrayList<String> listFiles(String path) {
    ArrayList<String> list = null;
    try {
        list = new ArrayList<>();
        FileStatus[] files = hdfs.listStatus(new Path(path));
        for (FileStatus file : files) {
            if (!file.isDirectory()) {
                list.add(file.getPath().toString());
            }
        }
    } catch (IllegalArgumentException | IOException e) {
        logger.error("listFiles failed", e);
    }
    return list;
}
From source file:wherehows.SchemaFetch.java
License:Open Source License
/**
 * Decide whether this is a dataset by the format of its sub directories
 *
 * @param path
 * @return 1 : empty dataset or lowest level dataset
 *         0 : may have sub dataset
 *         < 0 : error
 * @throws java.io.IOException
 * @throws AccessControlException
 */
private static int isTable(Path path, FileSystem fs) throws IOException, AccessControlException {
    int hiddenFileCount = 0;
    int datePartitionCount = 0;
    int dataSetCount = 0;
    int fileCount = 0;
    int i = 0;
    String objName;

    try {
        // System.err.println(" Probing " + path.toString());
        for (FileStatus fstat : fs.listStatus(path)) {
            objName = fstat.getPath().getName();
            if (!fstat.isDirectory()) {
                // file
                fileCount++;
            } else if (objName.matches("(_|\\.|tmp|temp|_distcp|backup|\\*|test|trash).*")) {
                // hidden/temporary fs object
                hiddenFileCount++;
            } else if (objName.matches(
                    "daily|hourly|hourly.deduped|monthly|weekly|(ds|dt|datepartition|year|month|date)=[0-9-]+")) {
                // temporal partition type
                datePartitionCount++;
            } else if (objName.matches(
                    "[0-9\\-_]+\\w+[0-9\\-_]+|\\w+_day=[0-9\\-_]+|\\p{Alnum}+=[0-9\\-_]+|[0-9\\-_]+|[0-9]{14}_\\w+|[0-9]{8}_\\w+|[0-9]{4}-[0-9]{2}-[0-9]{2}.*")) {
                // temporal
                datePartitionCount++;
            } else {
                // sub directory
                dataSetCount++;
            }
            i++;
        } // end of for fstat
    } // end of try
    catch (AccessControlException e) {
        return -1; // Usually there is a permission issue
    } catch (IOException e) {
        return -2;
    } catch (Exception e) {
        return -3;
    }

    // System.err.println(" -- isTable(" + path.toString() + ") i=" + i + " datePartition=" + datePartitionCount + " dataSet=" + dataSetCount);
    if (i == 0 || dataSetCount == 0) {
        return 1;
    } else if (i > 0 && datePartitionCount > 0) {
        return 1;
    } else {
        return 0;
    }
}
From source file:wherehows.SchemaFetch.java
License:Open Source License
/**
 * Collect one dataset's metadata
 *
 * @param path
 * @throws java.io.IOException
 */
private static void traceTableInfo(Path path, FileSystem tranceFs) throws IOException, SQLException {
    logger.info("trace table : " + path.toUri().getPath());

    // analyze the pattern of the name
    String tbl_name = path.getName();
    if (tbl_name.matches("(_|\\.|tmp|temp|stg|test|\\*).*")) { // skip _temporary _schema.avsc
        return;
    }

    FileStatus[] fstat_lst;
    FileStatus fstat = tranceFs.getFileStatus(path);
    String fullPath = path.toUri().getPath();
    String xName = "";
    long data_size = -1;
    long sample_data_size = -1;
    int i, x;
    // String data_source = checkDataSource(fullPath);

    // TODO this part needs a rewrite
    try {
        while (fstat.isDirectory()) {
            fstat_lst = tranceFs.listStatus(fstat.getPath()); // list all children
            if (fstat_lst.length == 0) { // empty directory
                logger.info(fstat.getPath().toUri().getPath() + " is empty.");
                return;
            }

            int is_fstat_visible = 0;
            for (i = fstat_lst.length - 1; i >= 0; i--) { // iterate from the last item back to the first
                fstat = fstat_lst[i]; // start from the last file in the list
                xName = fstat.getPath().getName();

                if (xName.matches("\\.pig_schema|.*\\.avsc|\\.dataset")) {
                    is_fstat_visible = 1;
                    break;
                } else if (xName.equals("hourly") && i > 0
                        && fstat_lst[i - 1].getPath().getName().equals("daily")) {
                    continue; // try to traverse "daily" instead of "hourly" when possible
                } else if (xName.matches("(_|\\.|tmp|temp).*")) {
                    continue;
                }

                try { // sub directory may be inaccessible
                    sample_data_size = fstat.isDirectory()
                            ? tranceFs.getContentSummary(fstat.getPath()).getLength()
                            : fstat.getLen();
                } catch (AccessControlException e) {
                    if (tranceFs.listStatus(fstat.getPath()).length > 0) {
                        is_fstat_visible = 1;
                        break;
                    } else {
                        continue;
                    }
                }

                if (!fstat.isDirectory()
                        && !xName.matches("(_|\\.).*|.*\\.(jar|json|txt|csv|tsv|zip|gz|lzo)")) {
                    is_fstat_visible = 1;
                    break;
                }

                // if fstat is a directory
                if (fstat.isDirectory() && !xName.matches("(_|\\.).*")) {
                    is_fstat_visible = 1;
                    break;
                }
            }

            // logger.info(fstat.getPath() + " is_fstat_visible : " + is_fstat_visible);
            if (is_fstat_visible == 0) {
                return;
            }
        }
    } catch (AccessControlException e) {
        logger.error("* TblInfo() Cannot access " + fstat.getPath().toUri().getPath());
        return;
    }

    // get schema and sample data
    DatasetJsonRecord datasetSchemaRecord = fileAnalyzerFactory.getSchema(fstat.getPath(),
            path.toUri().getPath());
    if (datasetSchemaRecord != null) {
        schemaFileWriter.append(datasetSchemaRecord);
    } else {
        logger.error("* Cannot resolve the schema of " + fullPath);
    }

    SampleDataRecord sampleDataRecord = fileAnalyzerFactory.getSampleData(fstat.getPath(),
            path.toUri().getPath());
    if (sampleDataRecord != null) {
        sampleFileWriter.append(sampleDataRecord);
    } else {
        System.err.println("* Cannot fetch sample data of " + fullPath);
    }
}
From source file:wikiduper.clir.rp.TextDocnoMappingBuilder.java
License:Apache License
@Override
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    System.out.println("WTF WTF args: " + Arrays.toString(args));
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + TextDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TextDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = Job.getInstance(getConf());
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TextDocnoMappingBuilder.class);
    job.setNumReduceTasks(1);

    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
    Path collectionPath = new Path(options.collection);
    for (FileStatus status : fs.listStatus(collectionPath, filter)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TextInputFormat.class); // options.inputFormat
    LOG.info("Input format : " + options.inputFormat);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    LOG.info("Here1\n");

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
    fs.delete(new Path(tmpDir), true);

    return 0;
}