Example usage for org.apache.hadoop.fs FileStatus isDirectory

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileStatus.isDirectory().

Prototype

public boolean isDirectory() 

Document

Is this a directory?
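
Before the per-project examples below, here is a minimal self-contained sketch of the call. It is illustrative only: the class name IsDirectoryDemo and the path /tmp are placeholders, not taken from any of the sources that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IsDirectoryDemo {
    public static void main(String[] args) throws Exception {
        // Connect to the file system named in the default configuration
        // (HDFS on a cluster, the local file system otherwise).
        FileSystem fs = FileSystem.get(new Configuration());

        // getFileStatus() returns the metadata record for a single path.
        FileStatus status = fs.getFileStatus(new Path("/tmp"));

        // isDirectory() supersedes the deprecated isDir().
        System.out.println(status.getPath() + " is a directory: " + status.isDirectory());
    }
}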

Usage

From source file:org.trustedanalytics.cfbroker.store.hdfs.service.SimpleHdfsClient.java

License:Apache License

@Override
public List<String> listFiles(String path, boolean recursive) throws IOException {
    List<String> files = new ArrayList<>();
    FileStatus[] statuses = fs.listStatus(new Path(path));

    for (FileStatus status : statuses) {
        files.add(status.getPath().toString());
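        // Recurse into subdirectories when requested.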
        if (status.isDirectory() && recursive)
            files.addAll(listFiles(status.getPath().toString(), recursive));
    }

    return files;
}
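
Called as, say, listFiles("/user/demo", true) (the path is purely illustrative), this returns every entry under the directory, including the subdirectory paths themselves, since each status is recorded before the recursion check.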

From source file:org.wikimedia.wikihadoop.StreamWikiDumpInputFormat.java

License:Apache License

/** 
 * Generate the list of files and make them into FileSplits.
 * @param job the job context
 * @throws IOException
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.info("StreamWikiDumpInputFormat.getSplits job=" + job + " n=" + numSplits);
    InputSplit[] oldSplits = super.getSplits(job, numSplits);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);
    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    long totalSize = 0; // compute total size
    for (FileStatus file : files) { // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }
    long minSize = job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1);
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    for (FileStatus file : files) {
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(goalSize, minSize, blockSize);
        LOG.info(String.format("goalsize=%d splitsize=%d blocksize=%d", goalSize, splitSize, blockSize));
        //System.err.println(String.format("goalsize=%d splitsize=%d blocksize=%d", goalSize, splitSize, blockSize));
        for (InputSplit x : getSplits(job, file, pageBeginPattern, splitSize))
            splits.add(x);
    }
    System.err.println("splits=" + splits);
    return splits.toArray(new InputSplit[splits.size()]);
}

From source file:stewi.mapred.LenientSequenceFileInputFormat.java

License:Apache License

@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
    FileStatus[] files = super.listStatus(job);
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        if (file.isDirectory()) { // it's a MapFile
            Path dataFile = new Path(file.getPath(), MapFile.DATA_FILE_NAME);
            FileSystem fs = file.getPath().getFileSystem(job);
            // use the data file
            files[i] = fs.getFileStatus(dataFile);
        }
    }
    return files;
}
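
A MapFile is a directory holding paired index and data SequenceFiles, so substituting the data file lets the plain SequenceFile reader process it directly.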

From source file:streaming.core.HDFSTarEntry.java

License:Apache License

public HDFSTarEntry(FileStatus hdfsFileStatus, String entryName) {
    super(null, entryName);
    this.hdfsFileStatus = hdfsFileStatus;
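    // HDFS reports modification times in milliseconds; tar headers store seconds.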
    header = TarHeader.createHeader(entryName, hdfsFileStatus.getLen(),
            hdfsFileStatus.getModificationTime() / 1000, hdfsFileStatus.isDirectory());
}

From source file:terasort.io.TeraOutputFormat.java

License:Apache License

@Override
public void checkOutputSpecs(JobContext job) throws IOException {
    // Ensure that the output directory is set
    Path outDir = getOutputPath(job);
    if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }

    final Configuration jobConf = job.getConfiguration();

    // get delegation token for outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, jobConf);

    final FileSystem fs = outDir.getFileSystem(jobConf);

    if (fs.exists(outDir)) {
        // existing output dir is considered empty iff its only content is the
        // partition file.
        //
        final FileStatus[] outDirKids = fs.listStatus(outDir);
        boolean empty = false;
        if (outDirKids != null && outDirKids.length == 1) {
            final FileStatus st = outDirKids[0];
            final String fname = st.getPath().getName();
            // The lone child must be the partition file, not just any non-directory,
            // to match the comment above (assuming the usual
            // TeraInputFormat.PARTITION_FILENAME constant); otherwise fname is unused.
            empty = !st.isDirectory() && TeraInputFormat.PARTITION_FILENAME.equals(fname);
        }
        if (!empty) {
            throw new FileAlreadyExistsException("Output directory " + outDir + " already exists");
        }
    }
}

From source file:wanggang1987.bigdataapi.hadoopapi.HadoopClientAPI.java

/**
 * List all non-directory files directly under the given path.
 *
 * @param path the directory to list
 * @return the file paths, or an empty list if the listing fails
 */
public ArrayList<String> listFiles(String path) {
    ArrayList<String> list = null;
    try {
        list = new ArrayList<>();
        FileStatus[] files = hdfs.listStatus(new Path(path));
        for (FileStatus file : files) {
            if (!file.isDirectory()) {
                list.add(file.getPath().toString());
            }
        }
    } catch (IllegalArgumentException | IOException e) {
        logger.error("listFiles failed", e);
    }
    return list;
}

From source file:wherehows.SchemaFetch.java

License:Open Source License

/**
 * Decide whether this path is a dataset from the format of its subdirectories.
 *
 * @param path
 * @return 1 : empty dataset or lowest-level dataset
 *         0 : may have sub-datasets
 *         < 0 : error
 * @throws java.io.IOException
 * @throws AccessControlException
 */
private static int isTable(Path path, FileSystem fs) throws IOException, AccessControlException {
    int hiddenFileCount = 0;
    int datePartitionCount = 0;
    int dataSetCount = 0;
    int fileCount = 0;
    int i = 0;
    String objName;

    try {
        // System.err.println("  Probing " + path.toString());
        for (FileStatus fstat : fs.listStatus(path)) {
            objName = fstat.getPath().getName();

            if (!fstat.isDirectory()) {
                // file
                fileCount++;
            } else if (objName.matches("(_|\\.|tmp|temp|_distcp|backup|\\*|test|trash).*")) {
                // hidden/temporary fs object
                hiddenFileCount++;
            } else if (objName.matches(
                    "daily|hourly|hourly.deduped|monthly|weekly|(ds|dt|datepartition|year|month|date)=[0-9-]+")) {
                // temporal partition type
                datePartitionCount++;
            } else if (objName.matches(
                    "[0-9\\-_]+\\w+[0-9\\-_]+|\\w+_day=[0-9\\-_]+|\\p{Alnum}+=[0-9\\-_]+|[0-9\\-_]+|[0-9]{14}_\\w+|[0-9]{8}_\\w+|[0-9]{4}-[0-9]{2}-[0-9]{2}.*")) {
                // temporal
                datePartitionCount++;
            } else {
                // sub directory
                dataSetCount++;
            }
            i++;
        } // end of for fstat
    } // end of try
    catch (AccessControlException e) {
        return -1; // Usually there is a permission issue
    } catch (IOException e) {
        return -2;
    } catch (Exception e) {
        return -3;
    }

    // System.err.println("  -- isTable(" + path.toString() + ") i=" + i + " datePartition=" + datePartitionCount + " dataSet=" + dataSetCount);
    if (i == 0 || dataSetCount == 0) {
        return 1;
    } else if (i > 0 && datePartitionCount > 0) {
        return 1;
    } else {
        return 0;
    }
}

From source file:wherehows.SchemaFetch.java

License:Open Source License

/**
 * Collect one dataset's metadata.
 *
 * @param path
 * @throws java.io.IOException
 */
private static void traceTableInfo(Path path, FileSystem tranceFs) throws IOException, SQLException {
    logger.info("trace table : " + path.toUri().getPath());
    // analyze the pattern of the name
    String tbl_name = path.getName();
    if (tbl_name.matches("(_|\\.|tmp|temp|stg|test|\\*).*")) // skip _temporary _schema.avsc
    {
        return;
    }

    FileStatus[] fstat_lst;
    FileStatus fstat = tranceFs.getFileStatus(path);
    String fullPath = path.toUri().getPath();
    String xName = "";
    long data_size = -1;
    long sample_data_size = -1;
    int i, x;
    // String data_source = checkDataSource(fullPath);

    // TODO this part need to rewrite
    try {
        while (fstat.isDirectory()) {

            fstat_lst = tranceFs.listStatus(fstat.getPath()); // list all children
            if (fstat_lst.length == 0) { // empty directory
                logger.info(fstat.getPath().toUri().getPath() + " is empty.");
                return;
            }

            int is_fstat_visible = 0;
            for (i = fstat_lst.length - 1; i >= 0; i--) { // iterate from the last item back to the first
                fstat = fstat_lst[i]; // start from the last file in the list
                xName = fstat.getPath().getName();

                if (xName.matches("\\.pig_schema|.*\\.avsc|\\.dataset")) {
                    is_fstat_visible = 1;
                    break;
                } else if (xName.equals("hourly") && i > 0
                        && fstat_lst[i - 1].getPath().getName().equals("daily")) {
                    continue; // try to traverse "daily" instead of "hourly" when possible
                } else if (xName.matches("(_|\\.|tmp|temp).*")) {
                    continue;
                }

                try { // sub directory may be inaccessible
                    sample_data_size = fstat.isDirectory()
                            ? tranceFs.getContentSummary(fstat.getPath()).getLength()
                            : fstat.getLen();
                } catch (AccessControlException e) {
                    if (tranceFs.listStatus(fstat.getPath()).length > 0) {
                        is_fstat_visible = 1;
                        break;
                    } else {
                        continue;
                    }
                }

                if (!fstat.isDirectory()
                        && !xName.matches("(_|\\.).*|.*\\.(jar|json|txt|csv|tsv|zip|gz|lzo)")) {
                    is_fstat_visible = 1;
                    break;
                }

                // if fstat is a Directory
                if (fstat.isDirectory() && !xName.matches("(_|\\.).*")) {
                    is_fstat_visible = 1;
                    break;
                }
            }
            // logger.info(fstat.getPath() + "is_fstat_visible : " + is_fstat_visible);
            if (is_fstat_visible == 0) {
                return;
            }
        }
    } catch (AccessControlException e) {
        logger.error("* TblInfo() Cannot access " + fstat.getPath().toUri().getPath());
        return;
    }

    // get schema and sample data
    DatasetJsonRecord datasetSchemaRecord = fileAnalyzerFactory.getSchema(fstat.getPath(),
            path.toUri().getPath());
    if (datasetSchemaRecord != null) {
        schemaFileWriter.append(datasetSchemaRecord);
    } else {
        logger.error("* Cannot resolve the schema of " + fullPath);
    }

    SampleDataRecord sampleDataRecord = fileAnalyzerFactory.getSampleData(fstat.getPath(),
            path.toUri().getPath());
    if (sampleDataRecord != null) {
        sampleFileWriter.append(sampleDataRecord);
    } else {
        System.err.println("* Cannot fetch sample data of " + fullPath);
    }
}

From source file:wikiduper.clir.rp.TextDocnoMappingBuilder.java

License:Apache License

@Override
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    System.out.println("WTF WTF args: " + Arrays.toString(args));
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + TextDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TextDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = Job.getInstance(getConf());

    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TextDocnoMappingBuilder.class);

    job.setNumReduceTasks(1);

    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
    Path collectionPath = new Path(options.collection);
    for (FileStatus status : fs.listStatus(collectionPath, filter)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TextInputFormat.class); //options.inputFormat);
    LOG.info("Input format : " + options.inputFormat);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    LOG.info("Here1\n");
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
    fs.delete(new Path(tmpDir), true);

    return 0;
}