Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Document

Return all the files that match filePattern and are not checksum files.
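
For quick orientation before the project examples below, here is a minimal, self-contained sketch of calling globStatus against a glob pattern. The pattern /data/logs/*.log and the class name GlobStatusExample are placeholders chosen for illustration; the null guard mirrors the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();

        // Placeholder glob pattern; replace with a pattern that exists on your filesystem.
        Path pattern = new Path("/data/logs/*.log");
        FileSystem fs = pattern.getFileSystem(conf);

        // globStatus may return null (e.g. a non-glob path that does not exist),
        // so check the result before iterating.
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}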

Usage

From source file: org.apache.pig.builtin.OrcStorage.java

License: Apache License

private static Path getFirstFile(String location, FileSystem fs) throws IOException {
    String[] locations = getPathStrings(location);
    Path[] paths = new Path[locations.length];
    for (int i = 0; i < paths.length; ++i) {
        paths[i] = new Path(locations[i]);
    }
    List<FileStatus> statusList = new ArrayList<FileStatus>();
    for (int i = 0; i < paths.length; ++i) {
        FileStatus[] files = fs.globStatus(paths[i]);
        if (files != null) {
            for (FileStatus tempf : files) {
                statusList.add(tempf);
            }
        }
    }
    FileStatus[] statusArray = (FileStatus[]) statusList.toArray(new FileStatus[statusList.size()]);
    Path p = Utils.depthFirstSearchForFile(statusArray, fs);
    return p;
}

From source file: org.apache.pig.impl.io.FileLocalizer.java

License: Apache License

/**
 * Copies the files from remote to local filesystem.
 * When 'multipleFiles' is set the path could point to multiple files
 * through globs or a directory. In this case, the returned array contains
 * multiple files; otherwise a single file is returned.
 *
 * If pig.jars.relative.to.dfs is true, a relative path is assumed to be
 * relative to the default filesystem's active directory.
 * Otherwise it is assumed to be relative to the local working directory.
 *
 * @param properties
 * @param filePath
 * @param multipleFiles
 * @return
 */
private static FetchFileRet[] fetchFilesInternal(Properties properties, String filePath, boolean multipleFiles)
        throws IOException {

    Path path = new Path(filePath);
    URI uri = path.toUri();
    Configuration conf = new Configuration();
    ConfigurationUtil.mergeConf(conf, ConfigurationUtil.toConfiguration(properties));

    // if there is no schema or if the schema is "local", then it is
    // expected to be a local path.

    FileSystem localFs = FileSystem.getLocal(conf);
    FileSystem srcFs;
    if ((!"true".equals(properties.getProperty("pig.jars.relative.to.dfs")) && uri.getScheme() == null) ||
    // For Windows local files
            (uri.getScheme() == null && uri.getPath().matches("^/[A-Za-z]:.*"))
            || "local".equals(uri.getScheme())) {
        srcFs = localFs;
    } else {
        srcFs = path.getFileSystem(conf);
    }

    FileStatus[] files;

    if (multipleFiles) {
        files = srcFs.globStatus(path);
    } else {
        files = new FileStatus[] { srcFs.getFileStatus(path) };
    }
    if (files == null || files.length == 0) {
        throw new ExecException("file '" + filePath + "' does not exist.", 101, PigException.INPUT);
    }

    FetchFileRet[] fetchFiles = new FetchFileRet[files.length];
    int idx = 0;

    for (FileStatus file : files) {
        // should throw an exception if this is not a file?

        String pathname = file.getPath().toUri().getPath();
        String filename = file.getPath().getName();

        if (srcFs == localFs) {
            fetchFiles[idx++] = new FetchFileRet(new File(pathname), false);
        } else {
            // fetch from remote:
            File dest = new File(localTempDir, filename);
            dest.deleteOnExit();
            try {
                srcFs.copyToLocalFile(file.getPath(), new Path(dest.getAbsolutePath()));
            } catch (IOException e) {
                throw new ExecException("Could not copy " + filePath + " to local destination " + dest, 101,
                        PigException.INPUT, e);
            }
            fetchFiles[idx++] = new FetchFileRet(dest, true);
        }
    }

    return fetchFiles;
}

From source file: org.apache.pig.piggybank.test.storage.TestXMLLoader.java

License: Apache License

/**
 * This test case tests the special case where a non-matching tag spans two file
 * splits in a .bz2 compressed file and the part that falls in the first split
 * is a prefix of the matching tag.
 * In other words, up to the end of the first split it looks like the tag
 * matches, but it does not actually match.
 *
 * @throws Exception
 */
public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws Exception {
    Configuration conf = new Configuration();
    long blockSize = 100 * 1024;
    conf.setLong("fs.local.block.size", blockSize);

    String tagName = "event";

    PigServer pig = new PigServer(LOCAL, conf);
    FileSystem localFs = FileSystem.getLocal(conf);
    FileStatus[] testFiles = localFs
            .globStatus(new Path("src/test/java/org/apache/pig/piggybank/test/evaluation/xml/data/*xml.bz2"));
    assertTrue("No test files", testFiles.length > 0);
    for (FileStatus testFile : testFiles) {
        String testFileName = testFile.getPath().toUri().getPath().replace("\\", "\\\\");
        String query = "A = LOAD '" + testFileName
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            else {
                if (tuple.size() > 0) {
                    assertTrue(((String) tuple.get(0)).startsWith("<" + tagName + ">"));
                }
            }
        }
    }
}

From source file: org.apache.pig.test.PigStorageWithStatistics.java

License: Apache License

private Long getInputSizeInBytes() throws IOException {
    if (loc == null) {
        return 0L;
    }

    long inputBytes = 0L;
    for (String location : getPathStrings(loc)) {
        Path path = new Path(location);
        FileSystem fs = path.getFileSystem(new Configuration());
        FileStatus[] status = fs.globStatus(path);
        if (status != null) {
            for (FileStatus s : status) {
                inputBytes += MapRedUtil.getPathLength(fs, s);
            }
        }
    }
    return inputBytes;
}

From source file: org.apache.ranger.authorization.hive.authorizer.RangerHiveAuthorizer.java

License: Apache License

private boolean isURIAccessAllowed(String userName, FsAction action, String uri, HiveConf conf) {
    boolean ret = false;

    if (action == FsAction.NONE) {
        ret = true;
    } else {
        try {
            Path filePath = new Path(uri);
            FileSystem fs = FileSystem.get(filePath.toUri(), conf);
            FileStatus[] filestat = fs.globStatus(filePath);

            if (filestat != null && filestat.length > 0) {
                ret = true;

                for (FileStatus file : filestat) {
                    ret = FileUtils.isOwnerOfFileHierarchy(fs, file, userName)
                            || FileUtils.isActionPermittedForFileHierarchy(fs, file, userName, action);

                    if (!ret) {
                        break;
                    }
                }
            } else { // if given path does not exist then check for parent
                FileStatus file = FileUtils.getPathOrParentThatExists(fs, filePath);

                FileUtils.checkFileAccessWithImpersonation(fs, file, action, userName);
                ret = true;
            }
        } catch (Exception excp) {
            LOG.error("Error getting permissions for " + uri, excp);
        }
    }

    return ret;
}

From source file: org.apache.sqoop.mapreduce.odps.HdfsOdpsImportJob.java

License: Apache License

private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
        throws IOException {

    ArrayList<FileStatus> files = new ArrayList<FileStatus>();
    FileStatus[] dirs;
    dirs = fs.globStatus(fs.makeQualified(getInputPath()));
    for (int i = 0; (dirs != null && i < dirs.length); i++) {
        files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
        // We only check one file, so exit the loop when we have at least
        // one.
        if (files.size() > 0) {
            break;
        }
    }

    ParquetMetadata parquetMetadata;
    try {
        parquetMetadata = ParquetFileReader.readFooter(job.getConfiguration(),
                fs.makeQualified(files.get(0).getPath()));
    } catch (IOException e) {
        LOG.error("Wrong file format. Please check the export file's format.", e);
        throw e;
    }
    MessageType schema = parquetMetadata.getFileMetaData().getSchema();
    Schema avroSchema = new AvroSchemaConverter().convert(schema);
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
            .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
    return descriptor;
}

From source file: org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java

License: Apache License

public static void decompressPath(final FileSystem fs, final String in, final String out,
        final String compressedFileSuffix, final boolean deletePrevious) throws IOException {
    final Path inPath = new Path(in);

    if (fs.isFile(inPath))
        HDFSTools.decompressFile(fs, in, out, deletePrevious);
    else {
        final Path outPath = new Path(out);
        if (!fs.exists(outPath))
            fs.mkdirs(outPath);
        for (final Path path : FileUtil.stat2Paths(fs.globStatus(new Path(in + FORWARD_ASTERISK)))) {
            if (path.getName().endsWith(compressedFileSuffix))
                HDFSTools.decompressFile(fs, path.toString(),
                        outPath.toString() + FORWARD_SLASH + path.getName().split("\\.")[0], deletePrevious);
        }
    }
}

From source file: org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java

License: Apache License

public static boolean globDelete(final FileSystem fs, final String path, final boolean recursive)
        throws IOException {
    boolean deleted = false;
    for (final Path p : FileUtil.stat2Paths(fs.globStatus(new Path(path)))) {
        fs.delete(p, recursive);
        deleted = true;
    }
    return deleted;
}

From source file: org.archive.hadoop.jobs.CDXGenerator.java

License: Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("CDX Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating CDXs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(CDXGeneratorMapper.class);
    job.setJarByClass(CDXGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to CDXGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file: org.archive.hadoop.jobs.WARCMetadataRecordGenerator.java

License: Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WARCMetadataRecord Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WARCMetadataRecordGeneratorMapper.class);
    job.setJarByClass(WARCMetadataRecordGenerator.class);

    //extract outlinks by default
    job.set("outputType", "outlinks");
    int arg = 0;
    if (args[arg].equals("-hopinfo")) {
        job.set("outputType", "hopinfo");
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to WARCMetadataRecordGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}