Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
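
For quick orientation before the project examples below, here is a minimal, self-contained sketch of calling globStatus against a glob pattern. The pattern /data/logs/*.log and the class name GlobStatusExample are placeholders chosen for illustration; the null guard mirrors the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Placeholder glob pattern; replace with a pattern that exists on your filesystem.
        Path pattern = new Path("/data/logs/*.log");
        FileSystem fs = pattern.getFileSystem(conf);

        // globStatus may return null (e.g. a non-glob path that does not exist),
        // so check the result before iterating.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}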

Usage

From source file: org.apache.pig.builtin.OrcStorage.java

License: Apache License

private static Path getFirstFile(String location, FileSystem fs) throws IOException {
    String[] locations = getPathStrings(location);
    Path[] paths = new Path[locations.length];
    for (int i = 0; i < paths.length; ++i) {
        paths[i] = new Path(locations[i]);
    }
    List<FileStatus> statusList = new ArrayList<FileStatus>();
    for (int i = 0; i < paths.length; ++i) {
        FileStatus[] files = fs.globStatus(paths[i]);
        if (files != null) {
            for (FileStatus tempf : files) {
                statusList.add(tempf);
            }
        }
    }
    FileStatus[] statusArray = (FileStatus[]) statusList.toArray(new FileStatus[statusList.size()]);
    Path p = Utils.depthFirstSearchForFile(statusArray, fs);
    return p;
}

From source file: org.apache.pig.impl.io.FileLocalizer.java

License: Apache License

/**
 * Copies the files from remote to local filesystem.
 * When 'multipleFiles' is set the path could point to multiple files
 * through globs or a directory. In this case, the returned array contains
 * multiple files; otherwise a single file is returned.
 *
 * If pig.jars.relative.to.dfs is true, a relative path is assumed to be
 * relative to the default filesystem's active directory.
 * Otherwise it is assumed to be relative to the local working directory.
 *
 * @param properties
 * @param filePath
 * @param multipleFiles
 * @return
 */
private static FetchFileRet[] fetchFilesInternal(Properties properties, String filePath, boolean multipleFiles)
        throws IOException {

    Path path = new Path(filePath);
    URI uri = path.toUri();
    Configuration conf = new Configuration();
    ConfigurationUtil.mergeConf(conf, ConfigurationUtil.toConfiguration(properties));

    // if there is no schema or if the schema is "local", then it is
    // expected to be a local path.

    FileSystem localFs = FileSystem.getLocal(conf);
    FileSystem srcFs;
    if ((!"true".equals(properties.getProperty("pig.jars.relative.to.dfs")) && uri.getScheme() == null) ||
    // For Windows local files
            (uri.getScheme() == null && uri.getPath().matches("^/[A-Za-z]:.*"))
            || "local".equals(uri.getScheme())) {
        srcFs = localFs;
    } else {
        srcFs = path.getFileSystem(conf);
    }

    FileStatus[] files;

    if (multipleFiles) {
        files = srcFs.globStatus(path);
    } else {
        files = new FileStatus[] { srcFs.getFileStatus(path) };
    }
    if (files == null || files.length == 0) {
        throw new ExecException("file '" + filePath + "' does not exist.", 101, PigException.INPUT);
    }

    FetchFileRet[] fetchFiles = new FetchFileRet[files.length];
    int idx = 0;

    for (FileStatus file : files) {
        // should throw an exception if this is not a file?

        String pathname = file.getPath().toUri().getPath();
        String filename = file.getPath().getName();

        if (srcFs == localFs) {
            fetchFiles[idx++] = new FetchFileRet(new File(pathname), false);
        } else {
            // fetch from remote:
            File dest = new File(localTempDir, filename);
            dest.deleteOnExit();
            try {
                srcFs.copyToLocalFile(file.getPath(), new Path(dest.getAbsolutePath()));
            } catch (IOException e) {
                throw new ExecException("Could not copy " + filePath + " to local destination " + dest, 101,
                        PigException.INPUT, e);
            }
            fetchFiles[idx++] = new FetchFileRet(dest, true);
        }
    }

    return fetchFiles;
}

From source file: org.apache.pig.piggybank.test.storage.TestXMLLoader.java

License: Apache License

/**
 * This test case tests the special case where a non-matching tag spans two file
 * splits in a .bz2 compressed file and the part that falls in the first split
 * is a prefix of the matching tag.
 * In other words, up to the end of the first split it looks like the tag
 * matches, but it does not actually match.
 *
 * @throws Exception
 */
public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws Exception {
    Configuration conf = new Configuration();
    long blockSize = 100 * 1024;
    conf.setLong("fs.local.block.size", blockSize);

    String tagName = "event";

    PigServer pig = new PigServer(LOCAL, conf);
    FileSystem localFs = FileSystem.getLocal(conf);
    FileStatus[] testFiles = localFs
            .globStatus(new Path("src/test/java/org/apache/pig/piggybank/test/evaluation/xml/data/*xml.bz2"));
    assertTrue("No test files", testFiles.length > 0);
    for (FileStatus testFile : testFiles) {
        String testFileName = testFile.getPath().toUri().getPath().replace("\\", "\\\\");
        String query = "A = LOAD '" + testFileName
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    assertTrue(((String) tuple.get(0)).startsWith("<" + tagName + ">"));
                }
            }
        }
    }
}

From source file: org.apache.pig.test.PigStorageWithStatistics.java

License: Apache License

private Long getInputSizeInBytes() throws IOException {
    if (loc == null) {
        return 0L;
    }

    long inputBytes = 0L;
    for (String location : getPathStrings(loc)) {
        Path path = new Path(location);
        FileSystem fs = path.getFileSystem(new Configuration());
        FileStatus[] status = fs.globStatus(path);
        if (status != null) {
            for (FileStatus s : status) {
                inputBytes += MapRedUtil.getPathLength(fs, s);
            }
        }
    }
    return inputBytes;
}

From source file: org.apache.ranger.authorization.hive.authorizer.RangerHiveAuthorizer.java

License: Apache License

private boolean isURIAccessAllowed(String userName, FsAction action, String uri, HiveConf conf) {
    boolean ret = false;

    if (action == FsAction.NONE) {
        ret = true;
    } else {
        try {
            Path filePath = new Path(uri);
            FileSystem fs = FileSystem.get(filePath.toUri(), conf);
            FileStatus[] filestat = fs.globStatus(filePath);

            if (filestat != null && filestat.length > 0) {
                ret = true;

                for (FileStatus file : filestat) {
                    ret = FileUtils.isOwnerOfFileHierarchy(fs, file, userName)
                            || FileUtils.isActionPermittedForFileHierarchy(fs, file, userName, action);

                    if (!ret) {
                        break;
                    }
                }
            } else { // if given path does not exist then check for parent
                FileStatus file = FileUtils.getPathOrParentThatExists(fs, filePath);

                FileUtils.checkFileAccessWithImpersonation(fs, file, action, userName);
                ret = true;
            }
        } catch (Exception excp) {
            LOG.error("Error getting permissions for " + uri, excp);
        }
    }

    return ret;
}

From source file: org.apache.sqoop.mapreduce.odps.HdfsOdpsImportJob.java

License: Apache License

private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
        throws IOException {

    ArrayList<FileStatus> files = new ArrayList<FileStatus>();
    FileStatus[] dirs;
    dirs = fs.globStatus(fs.makeQualified(getInputPath()));
    for (int i = 0; (dirs != null && i < dirs.length); i++) {
        files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
        // We only check one file, so exit the loop when we have at least
        // one.
        if (files.size() > 0) {
            break;
        }
    }

    ParquetMetadata parquetMetadata;
    try {
        parquetMetadata = ParquetFileReader.readFooter(job.getConfiguration(),
                fs.makeQualified(files.get(0).getPath()));
    } catch (IOException e) {
        LOG.error("Wrong file format. Please check the export file's format.", e);
        throw e;
    }
    MessageType schema = parquetMetadata.getFileMetaData().getSchema();
    Schema avroSchema = new AvroSchemaConverter().convert(schema);
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
            .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
    return descriptor;
}

From source file: org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java

License: Apache License

public static void decompressPath(final FileSystem fs, final String in, final String out,
        final String compressedFileSuffix, final boolean deletePrevious) throws IOException {
    final Path inPath = new Path(in);

    if (fs.isFile(inPath))
        HDFSTools.decompressFile(fs, in, out, deletePrevious);
    else {
        final Path outPath = new Path(out);
        if (!fs.exists(outPath))
            fs.mkdirs(outPath);
        for (final Path path : FileUtil.stat2Paths(fs.globStatus(new Path(in + FORWARD_ASTERISK)))) {
            if (path.getName().endsWith(compressedFileSuffix))
                HDFSTools.decompressFile(fs, path.toString(),
                        outPath.toString() + FORWARD_SLASH + path.getName().split("\\.")[0], deletePrevious);
        }
    }
}

From source file: org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java

License: Apache License

public static boolean globDelete(final FileSystem fs, final String path, final boolean recursive)
        throws IOException {
    boolean deleted = false;
    for (final Path p : FileUtil.stat2Paths(fs.globStatus(new Path(path)))) {
        fs.delete(p, recursive);
        deleted = true;
    }
    return deleted;
}

From source file: org.archive.hadoop.jobs.CDXGenerator.java

License: Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("CDX Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating CDXs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(CDXGeneratorMapper.class);
    job.setJarByClass(CDXGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to CDXGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file: org.archive.hadoop.jobs.WARCMetadataRecordGenerator.java

License: Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WARCMetadataRecord Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WARCMetadataRecordGeneratorMapper.class);
    job.setJarByClass(WARCMetadataRecordGenerator.class);

    //extract outlinks by default
    job.set("outputType", "outlinks");
    int arg = 0;
    if (args[arg].equals("-hopinfo")) {
        job.set("outputType", "hopinfo");
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to WARCMetadataRecordGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}