List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files. Results are sorted by their names.
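A minimal, self-contained usage sketch (the /data/logs path and *.gz pattern are illustrative assumptions, not taken from the examples below): globStatus expands the glob against the filesystem, and a wildcard-free pattern that names a nonexistent path may yield null rather than an empty array, so callers generally null-check the result, as most of the examples on this page do.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Illustrative fragment: list every .gz file under /data/logs and print its size.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
FileStatus[] matches = fs.globStatus(new Path("/data/logs/*.gz"));
if (matches != null) { // null when the path named by the pattern does not exist
    for (FileStatus status : matches) {
        System.out.println(status.getPath() + " : " + status.getLen() + " bytes");
    }
}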
From source file:org.apache.pig.builtin.OrcStorage.java
License:Apache License
private static Path getFirstFile(String location, FileSystem fs) throws IOException {
    String[] locations = getPathStrings(location);
    Path[] paths = new Path[locations.length];
    for (int i = 0; i < paths.length; ++i) {
        paths[i] = new Path(locations[i]);
    }
    List<FileStatus> statusList = new ArrayList<FileStatus>();
    for (int i = 0; i < paths.length; ++i) {
        FileStatus[] files = fs.globStatus(paths[i]);
        if (files != null) {
            for (FileStatus tempf : files) {
                statusList.add(tempf);
            }
        }
    }
    FileStatus[] statusArray = statusList.toArray(new FileStatus[statusList.size()]);
    Path p = Utils.depthFirstSearchForFile(statusArray, fs);
    return p;
}
From source file:org.apache.pig.impl.io.FileLocalizer.java
License:Apache License
/**
 * Copies the files from the remote to the local filesystem.
 * When 'multipleFiles' is set, the path may point to multiple files
 * through globs or a directory. In that case the returned array contains
 * multiple files; otherwise a single file is returned.
 *
 * If pig.jars.relative.to.dfs is true, a relative path is assumed to be
 * relative to the default filesystem's active directory.
 * Otherwise it is assumed to be relative to the local working directory.
 *
 * @param properties
 * @param filePath
 * @param multipleFiles
 * @return
 */
private static FetchFileRet[] fetchFilesInternal(Properties properties, String filePath,
        boolean multipleFiles) throws IOException {

    Path path = new Path(filePath);
    URI uri = path.toUri();
    Configuration conf = new Configuration();
    ConfigurationUtil.mergeConf(conf, ConfigurationUtil.toConfiguration(properties));

    // If there is no scheme, or if the scheme is "local", the path is
    // expected to be a local path.
    FileSystem localFs = FileSystem.getLocal(conf);
    FileSystem srcFs;
    if ((!"true".equals(properties.getProperty("pig.jars.relative.to.dfs")) && uri.getScheme() == null)
            || // For Windows local files
            (uri.getScheme() == null && uri.getPath().matches("^/[A-Za-z]:.*"))
            || "local".equals(uri.getScheme())) { // null-safe check of the "local" scheme
        srcFs = localFs;
    } else {
        srcFs = path.getFileSystem(conf);
    }

    FileStatus[] files;
    if (multipleFiles) {
        files = srcFs.globStatus(path);
    } else {
        files = new FileStatus[] { srcFs.getFileStatus(path) };
    }
    if (files == null || files.length == 0) {
        throw new ExecException("file '" + filePath + "' does not exist.", 101, PigException.INPUT);
    }

    FetchFileRet[] fetchFiles = new FetchFileRet[files.length];
    int idx = 0;

    for (FileStatus file : files) {
        // should throw an exception if this is not a file?
        String pathname = file.getPath().toUri().getPath();
        String filename = file.getPath().getName();

        if (srcFs == localFs) {
            fetchFiles[idx++] = new FetchFileRet(new File(pathname), false);
        } else {
            // fetch from remote:
            File dest = new File(localTempDir, filename);
            dest.deleteOnExit();
            try {
                srcFs.copyToLocalFile(file.getPath(), new Path(dest.getAbsolutePath()));
            } catch (IOException e) {
                throw new ExecException("Could not copy " + filePath + " to local destination " + dest,
                        101, PigException.INPUT, e);
            }
            fetchFiles[idx++] = new FetchFileRet(dest, true);
        }
    }

    return fetchFiles;
}
From source file:org.apache.pig.piggybank.test.storage.TestXMLLoader.java
License:Apache License
/**
 * This test case tests the special case where a non-matching tag spans two
 * file splits in a .bz2 compressed file, and the part that falls in the
 * first split is a prefix of the matching tag.
 * In other words, up to the end of the first split it looks like the tag
 * matches, but it does not actually match.
 *
 * @throws Exception
 */
public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws Exception {
    Configuration conf = new Configuration();
    long blockSize = 100 * 1024;
    conf.setLong("fs.local.block.size", blockSize);

    String tagName = "event";
    PigServer pig = new PigServer(LOCAL, conf);
    FileSystem localFs = FileSystem.getLocal(conf);
    FileStatus[] testFiles = localFs
            .globStatus(new Path("src/test/java/org/apache/pig/piggybank/test/evaluation/xml/data/*xml.bz2"));
    assertTrue("No test files", testFiles != null && testFiles.length > 0);
    for (FileStatus testFile : testFiles) {
        String testFileName = testFile.getPath().toUri().getPath().replace("\\", "\\\\");
        String query = "A = LOAD '" + testFileName
                + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
        pig.registerQuery(query);
        Iterator<?> it = pig.openIterator("A");
        while (it.hasNext()) {
            Tuple tuple = (Tuple) it.next();
            if (tuple == null)
                break;
            if (tuple.size() > 0) {
                assertTrue(((String) tuple.get(0)).startsWith("<" + tagName + ">"));
            }
        }
    }
}
From source file:org.apache.pig.test.PigStorageWithStatistics.java
License:Apache License
private Long getInputSizeInBytes() throws IOException {
    if (loc == null) {
        return 0L;
    }

    long inputBytes = 0L;
    for (String location : getPathStrings(loc)) {
        Path path = new Path(location);
        FileSystem fs = path.getFileSystem(new Configuration());
        FileStatus[] status = fs.globStatus(path);
        if (status != null) {
            for (FileStatus s : status) {
                inputBytes += MapRedUtil.getPathLength(fs, s);
            }
        }
    }
    return inputBytes;
}
From source file:org.apache.ranger.authorization.hive.authorizer.RangerHiveAuthorizer.java
License:Apache License
private boolean isURIAccessAllowed(String userName, FsAction action, String uri, HiveConf conf) {
    boolean ret = false;

    if (action == FsAction.NONE) {
        ret = true;
    } else {
        try {
            Path filePath = new Path(uri);
            FileSystem fs = FileSystem.get(filePath.toUri(), conf);
            FileStatus[] filestat = fs.globStatus(filePath);

            if (filestat != null && filestat.length > 0) {
                ret = true;
                for (FileStatus file : filestat) {
                    ret = FileUtils.isOwnerOfFileHierarchy(fs, file, userName)
                            || FileUtils.isActionPermittedForFileHierarchy(fs, file, userName, action);
                    if (!ret) {
                        break;
                    }
                }
            } else { // if the given path does not exist then check for the parent
                FileStatus file = FileUtils.getPathOrParentThatExists(fs, filePath);
                FileUtils.checkFileAccessWithImpersonation(fs, file, action, userName);
                ret = true;
            }
        } catch (Exception excp) {
            LOG.error("Error getting permissions for " + uri, excp);
        }
    }

    return ret;
}
From source file:org.apache.sqoop.mapreduce.odps.HdfsOdpsImportJob.java
License:Apache License
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
        throws IOException {
    ArrayList<FileStatus> files = new ArrayList<FileStatus>();
    FileStatus[] dirs;
    dirs = fs.globStatus(fs.makeQualified(getInputPath()));
    for (int i = 0; (dirs != null && i < dirs.length); i++) {
        files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
        // We only check one file, so exit the loop when we have at least one.
        if (files.size() > 0) {
            break;
        }
    }

    ParquetMetadata parquetMetadata;
    try {
        parquetMetadata = ParquetFileReader.readFooter(job.getConfiguration(),
                fs.makeQualified(files.get(0).getPath()));
    } catch (IOException e) {
        LOG.error("Wrong file format. Please check the export file's format.", e);
        throw e;
    }
    MessageType schema = parquetMetadata.getFileMetaData().getSchema();
    Schema avroSchema = new AvroSchemaConverter().convert(schema);
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
            .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();

    return descriptor;
}
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java
License:Apache License
public static void decompressPath(final FileSystem fs, final String in, final String out,
        final String compressedFileSuffix, final boolean deletePrevious) throws IOException {
    final Path inPath = new Path(in);

    if (fs.isFile(inPath))
        HDFSTools.decompressFile(fs, in, out, deletePrevious);
    else {
        final Path outPath = new Path(out);
        if (!fs.exists(outPath))
            fs.mkdirs(outPath);
        for (final Path path : FileUtil.stat2Paths(fs.globStatus(new Path(in + FORWARD_ASTERISK)))) {
            if (path.getName().endsWith(compressedFileSuffix))
                HDFSTools.decompressFile(fs, path.toString(),
                        outPath.toString() + FORWARD_SLASH + path.getName().split("\\.")[0], deletePrevious);
        }
    }
}
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java
License:Apache License
public static boolean globDelete(final FileSystem fs, final String path, final boolean recursive)
        throws IOException {
    boolean deleted = false;
    for (final Path p : FileUtil.stat2Paths(fs.globStatus(new Path(path)))) {
        fs.delete(p, recursive);
        deleted = true;
    }
    return deleted;
}
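A brief usage sketch for the helper above (the part-file pattern is a hypothetical example, not from the TinkerPop source): one call removes every path matching the glob and reports whether anything was deleted.

// Hypothetical pattern: remove all part files left by a previous run.
FileSystem fs = FileSystem.get(new Configuration());
boolean removed = HDFSTools.globDelete(fs, "/tmp/job-output/part-*", false);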
From source file:org.archive.hadoop.jobs.CDXGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("CDX Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating CDXs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(CDXGeneratorMapper.class);
    job.setJarByClass(CDXGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to CDXGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }

    return 0;
}
From source file:org.archive.hadoop.jobs.WARCMetadataRecordGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WARCMetadataRecord Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WARCMetadataRecordGeneratorMapper.class);
    job.setJarByClass(WARCMetadataRecordGenerator.class);

    // extract outlinks by default
    job.set("outputType", "outlinks");

    int arg = 0;
    if (args[arg].equals("-hopinfo")) {
        job.set("outputType", "hopinfo");
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to WARCMetadataRecordGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }

    return 0;
}