List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files. Note that the result may be null (when a glob-free pathPattern names a path that does not exist) or an empty array (when the glob matches nothing); the examples below typically guard against both cases.
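Before the per-project examples, a minimal sketch of typical usage. The path /data/logs/*.log and the method name listMatchingLogs are hypothetical; the null and empty-array checks mirror the defensive pattern used throughout the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void listMatchingLogs() throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Expand the glob; /data/logs/*.log is a hypothetical pattern.
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/*.log"));
        // globStatus may return null (glob-free, non-existent path) or an
        // empty array (no matches), so check both before iterating.
        if (matches == null || matches.length == 0) {
            throw new IOException("No files match /data/logs/*.log");
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + "\t" + status.getLen() + " bytes");
        }
    }
}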
From source file: com.linkedin.cubert.utils.CommonUtils.java
License: Open Source License
public static Path getAFileInPath(Configuration conf, Path path, String suffix) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    if (fs.getFileStatus(path).isDir()) {
        // If given a directory, pick the first file matching *.<suffix> inside it.
        Path globPath = new Path(path, "*." + suffix);
        FileStatus[] allFiles = fs.globStatus(globPath);
        if (allFiles.length == 0) {
            throw new IOException("there are no files in " + path.toString());
        }
        path = allFiles[0].getPath();
    }
    print.f("Obtaining schema of %s file %s", suffix, path.toString());
    return path;
}
From source file: com.linkedin.cubert.utils.FileSystemUtils.java
License: Open Source License
public static List<Path> getGlobPaths(FileSystem fs, Path path) throws IOException {
    List<Path> paths = new ArrayList<Path>();
    FileStatus[] fileStatus = fs.globStatus(path);
    if (fileStatus == null)
        throw new IOException("Cannot determine paths at " + path.toString());
    for (FileStatus status : fileStatus) {
        paths.add(status.getPath());
    }
    return paths;
}
From source file: com.linkedin.cubert.utils.FileSystemUtils.java
License: Open Source License
public static Path getLatestPath(FileSystem fs, Path path) throws IOException {
    String pathStr = path.toString();

    // Return the same path, if there is no "#LATEST" within it
    if (!pathStr.contains("#LATEST"))
        return path;

    // replace all #LATEST with glob "*"
    pathStr = pathStr.replaceAll("#LATEST", "*");

    FileStatus[] fileStatus = fs.globStatus(new Path(pathStr));
    if (fileStatus == null || fileStatus.length == 0)
        throw new IOException("Cannot determine paths at " + pathStr);

    // The lexicographically greatest match is treated as the latest.
    String latestPath = null;
    for (FileStatus status : fileStatus) {
        String thisPath = status.getPath().toString();
        if (latestPath == null || thisPath.compareTo(latestPath) > 0)
            latestPath = thisPath;
    }

    return new Path(latestPath);
}
From source file: com.linkedin.pinot.hadoop.job.SegmentCreationJob.java
License: Apache License
public void run() throws Exception {
    LOGGER.info("Starting {}", getClass().getSimpleName());
    FileSystem fs = FileSystem.get(getConf());
    Path inputPathPattern = new Path(_inputSegmentDir);

    if (fs.exists(new Path(_stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(_stagingDir), true);
    }
    fs.mkdirs(new Path(_stagingDir));
    fs.mkdirs(new Path(_stagingDir + "/input/"));

    if (fs.exists(new Path(_outputDir))) {
        LOGGER.warn("Found the output folder, deleting it");
        fs.delete(new Path(_outputDir), true);
    }
    fs.mkdirs(new Path(_outputDir));

    // Expand the input glob and collect every data file under each match.
    List<FileStatus> inputDataFiles = new ArrayList<FileStatus>();
    FileStatus[] fileStatusArr = fs.globStatus(inputPathPattern);
    for (FileStatus fileStatus : fileStatusArr) {
        inputDataFiles.addAll(getDataFilesFromPath(fs, fileStatus.getPath()));
    }

    // Write one staging text file per input file; each holds "<path> <seqId>".
    for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
        FileStatus file = inputDataFiles.get(seqId);
        String completeFilePath = " " + file.getPath().toString() + " " + seqId;
        Path newOutPutFile = new Path((_stagingDir + "/input/"
                + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
        FSDataOutputStream stream = fs.create(newOutPutFile);
        stream.writeUTF(completeFilePath);
        stream.flush();
        stream.close();
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(SegmentCreationJob.class);
    job.setJobName(_jobName);
    job.setMapperClass(HadoopSegmentCreationMapper.class);
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/"));
    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", new ObjectMapper().writeValueAsString(_dataSchema));
    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);
    for (Object key : _properties.keySet()) {
        job.getConfiguration().set(key.toString(), _properties.getProperty(key.toString()));
    }
    if (_depsJarPath != null && _depsJarPath.length() > 0) {
        addDepsJarToDistributedCache(new Path(_depsJarPath), job);
    }

    // Submit the job for execution.
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", _stagingDir + "/output/segmentTar", _outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(_stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(_outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", _stagingDir);
    fs.delete(new Path(_stagingDir), true);
}
From source file: com.linkedin.pinot.hadoop.job.SegmentTarPushJob.java
License: Apache License
public void run() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(_segmentPath);
    FileStatus[] fileStatusArr = fs.globStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            pushDir(fs, fileStatus.getPath());
        } else {
            pushOneTarFile(fs, fileStatus.getPath());
        }
    }
}
From source file: com.linkedin.thirdeye.hadoop.push.SegmentPushPhase.java
License: Apache License
public void run() throws Exception {
    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);
    String segmentPath = getAndSetConfiguration(configuration, SEGMENT_PUSH_INPUT_PATH);
    LOGGER.info("Segment path : {}", segmentPath);
    hosts = getAndSetConfiguration(configuration, SEGMENT_PUSH_CONTROLLER_HOSTS)
            .split(ThirdEyeConstants.FIELD_SEPARATOR);
    port = getAndSetConfiguration(configuration, SEGMENT_PUSH_CONTROLLER_PORT);
    tablename = getAndCheck(ThirdEyeConfigProperties.THIRDEYE_TABLE_NAME.toString());
    Path path = new Path(segmentPath);
    FileStatus[] fileStatusArr = fs.globStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            pushDir(fs, fileStatus.getPath());
        } else {
            pushOneTarFile(fs, fileStatus.getPath());
        }
    }

    if (uploadSuccess && segmentName != null) {
        segmentPushControllerAPIs = new SegmentPushControllerAPIs(hosts, port);
        LOGGER.info("Deleting segments overlapping to {} from table {} ", segmentName, tablename);
        segmentPushControllerAPIs.deleteOverlappingSegments(tablename, segmentName);
    }
}
From source file: com.linkedin.whiteelephant.util.JobStatsProcessing.java
License: Apache License
public static List<ProcessingTask> getTasks(FileSystem fs, String logsRoot, String clusterName,
        String outputPathRoot, boolean incremental, int numDays, int numDaysForced) throws IOException {
    Calendar cal = Calendar.getInstance(timeZone);
    SimpleDateFormat yearFormat = new SimpleDateFormat("yyyy");
    SimpleDateFormat dayFormat = new SimpleDateFormat("MMdd");
    SimpleDateFormat idFormat = new SimpleDateFormat("yyyy-MM-dd");
    yearFormat.setTimeZone(timeZone);
    dayFormat.setTimeZone(timeZone);
    idFormat.setTimeZone(timeZone);

    List<ProcessingTask> processingTasks = new ArrayList<ProcessingTask>();

    numDays = Math.max(numDays, numDaysForced);

    // Start processing previous day of data since current day isn't yet finished.
    // Unless we are aggregating hourly data there is no point.
    cal.add(Calendar.DAY_OF_MONTH, -1);

    int numPaths = 0;
    long totalLength = 0;
    for (int i = 0; i < numDays; i++, cal.add(Calendar.DAY_OF_MONTH, -1)) {
        Date date = cal.getTime();
        String pathFormat = String.format("%s/%s/daily/*/%s/%s/*.log", logsRoot, clusterName,
                yearFormat.format(date), dayFormat.format(date));
        FileStatus[] stats = fs.globStatus(new Path(pathFormat));
        StringBuilder msg = new StringBuilder(pathFormat + " => " + stats.length + " files");
        String outputPathForDay = String.format("%s/%s/%s/%s", outputPathRoot, clusterName,
                yearFormat.format(date), dayFormat.format(date));
        if (stats.length > 0) {
            if (!incremental || !fs.exists(new Path(outputPathForDay)) || i < numDaysForced) {
                for (FileStatus stat : stats) {
                    totalLength += stat.getLen();
                    numPaths++;
                }
                String id = clusterName + "-" + idFormat.format(date);
                System.out.println(msg);
                processingTasks.add(new ProcessingTask(id, pathFormat, outputPathForDay, totalLength));
            } else if (incremental && fs.exists(new Path(outputPathForDay))) {
                msg.append(" (skipping)");
                System.out.println(msg);
            }
        }
    }

    System.out.println("Found " + numPaths + " paths to process, totalling " + totalLength + " bytes ("
            + (totalLength / (1024 * 1024 * 1024)) + " gigabytes)");

    return processingTasks;
}
From source file: com.moz.fiji.mapreduce.kvstore.lib.FileStoreHelper.java
License: Apache License
/**
 * Deserializes file- and DistributedCache-specific properties associated
 * with the KeyValueStore that owns this FileStoreHelper from the specified configuration.
 *
 * <p>This retains a reference to the KeyValueStoreConfiguration's backing Configuration
 * instance to use when opening files specified by this configuration.</p>
 *
 * @param conf the configuration to read.
 * @throws IOException if there's an error deserializing the configuration.
 */
public void initFromConf(KeyValueStoreConfiguration conf) throws IOException {
    setConf(conf.getDelegate());

    mDCachePrefix = conf.get(CONF_DCACHE_PREFIX_KEY, "");
    LOG.debug("Input dCachePrefix: " + mDCachePrefix);
    if (mDCachePrefix.isEmpty()) {
        // Read an ordinary list of files from the Configuration.
        // These may include directories and globs to expand.
        mInputPaths = Lists.map(Arrays.asList(conf.getStrings(CONF_PATHS_KEY, new String[0])),
                new Lists.Func<String, Path>() {
                    @Override
                    public Path eval(String in) {
                        LOG.debug("File input: " + in);
                        return new Path(in);
                    }
                });
    } else {
        // Use the dcache prefix to get the names of the files for this store.
        // The symlinks are already present in the working dir of the task.
        final FileSystem localFs = FileSystem.getLocal(conf.getDelegate());
        FileStatus[] statuses = localFs.globStatus(new Path(mDCachePrefix + "-*"));
        if (null == statuses || statuses.length == 0) {
            throw new IOException("No files associated with the job in the DistributedCache");
        }

        // Get the (absolute) input file paths to use.
        mInputPaths = Lists.map(Arrays.asList(statuses), new Lists.Func<FileStatus, Path>() {
            @Override
            public Path eval(FileStatus status) {
                Path out = status.getPath().makeQualified(localFs);
                LOG.debug("Loaded from DistributedCache: " + out);
                return out;
            }
        });
    }

    // If we are initializing a client-side instance to later serialize, the user may have
    // specified HDFS files, but also an intent to put the files in the DistributedCache. Set
    // this flag now, which will generate mDCachePrefix when addToConfiguration() is called
    // later.
    mUseDCache = conf.getBoolean(CONF_USE_DCACHE_KEY, USE_DCACHE_DEFAULT);
}
From source file: com.revolutionanalytics.hadoop.hdfs.DelayedExceptionThrowing.java
License: Apache License
final void globAndProcess(Path srcPattern, FileSystem srcFs) throws IOException {
    ArrayList<IOException> exceptions = new ArrayList<IOException>();
    for (Path p : FileUtil.stat2Paths(srcFs.globStatus(srcPattern), srcPattern)) {
        try {
            process(p, srcFs);
        } catch (IOException ioe) {
            exceptions.add(ioe);
        }
    }
    if (!exceptions.isEmpty()) {
        if (exceptions.size() == 1) {
            throw exceptions.get(0);
        } else {
            throw new IOException("Multiple IOExceptions: " + exceptions);
        }
    }
}
From source file: com.revolutionanalytics.hadoop.hdfs.FileUtils.java
License: Apache License
private static void ls__(FileSystem srcFS, String path, ArrayList<String> lsco, boolean dorecurse)
        throws IOException, FileNotFoundException {
    Path spath = new Path(path);
    FileStatus[] srcs = srcFS.globStatus(spath);
    if (srcs == null || srcs.length == 0) {
        throw new FileNotFoundException("Cannot access " + path + ": No such file or directory.");
    }
    // A single directory match is listed like "ls <dir>": show its children instead.
    if (srcs.length == 1 && srcs[0].isDir())
        srcs = srcFS.listStatus(srcs[0].getPath());
    Calendar c = Calendar.getInstance();
    for (FileStatus status : srcs) {
        StringBuilder sb = new StringBuilder();
        boolean idir = status.isDir();
        String x = idir ? "d" : "-";
        if (dorecurse && idir)
            ls__(srcFS, status.getPath().toUri().getPath(), lsco, dorecurse);
        else {
            sb.append(x);
            sb.append(status.getPermission().toString());
            sb.append(fsep);
            sb.append(status.getOwner());
            sb.append(fsep);
            sb.append(status.getGroup());
            sb.append(fsep);
            sb.append(status.getLen());
            sb.append(fsep);
            Date d = new Date(status.getModificationTime());
            sb.append(formatter.format(d));
            sb.append(fsep);
            sb.append(status.getPath().toUri().getPath());
            lsco.add(sb.toString());
        }
    }
}