List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match the given pathPattern and are not checksum files.
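Before the collected usages below, here is a minimal, self-contained sketch of a typical call, assuming a local Hadoop configuration; the path "/data/logs/part-*" and the class name GlobStatusExample are illustrative only, not taken from any of the source files that follow:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Expand the glob. Per the javadoc, the result is an empty array
        // when the pattern matches nothing and null when the path does
        // not exist, so guard against null before iterating.
        FileStatus[] matches = fs.globStatus(new Path("/data/logs/part-*"));
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}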
From source file:fi.tkk.ics.hadoop.bam.cli.Utils.java
License:Open Source License
/**
 * Merges the files in the given directory that have names given by
 * getMergeableWorkFile() into out.
 *
 * Outputs progress reports if commandName is non-null.
 */
public static void mergeInto(OutputStream out, Path directory, String basePrefix, String basePostfix,
        Configuration conf, String commandName) throws IOException {
    final FileSystem fs = directory.getFileSystem(conf);

    final FileStatus[] parts = fs.globStatus(new Path(directory, basePrefix
            + conf.get(WORK_FILENAME_PROPERTY) + basePostfix
            + "-[0-9][0-9][0-9][0-9][0-9][0-9]*"));

    int i = 0;
    Timer t = new Timer();

    for (final FileStatus part : parts) {
        if (commandName != null) {
            System.out.printf("%s :: Merging part %d (size %d)...", commandName, ++i, part.getLen());
            System.out.flush();
            t.start();
        }

        final InputStream in = fs.open(part.getPath());
        IOUtils.copyBytes(in, out, conf, false);
        in.close();

        if (commandName != null)
            System.out.printf(" done in %d.%03d s.\n", t.stopS(), t.fms());
    }

    for (final FileStatus part : parts)
        fs.delete(part.getPath(), false);
}
From source file:FormatStorage1.IColumnDataFile.java
License:Open Source License
public void open(String fileName, ArrayList<Integer> idxs) throws IOException {
    this.columnprojects = new ArrayList<ArrayList<Integer>>();
    this.head = new IHead();
    this.readidxs = new TreeSet<Integer>();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] statuss = fs.globStatus(new Path(fileName + "_idx*"));
    this.cp2ifdfs.clear();
    this.idx2ifdfs.clear();

    if (idxs != null && idxs.size() > 0) {
        readidxs.addAll(idxs);
        for (int i = 0; i < statuss.length; i++) {
            String file = statuss[i].getPath().toString();
            IFormatDataFile ifdf = null;
            String idxstr = file.substring(file.lastIndexOf("_idx") + 4);
            String[] sts = idxstr.split("_");
            TreeSet<Integer> ts = new TreeSet<Integer>();
            for (int j = 0; j < sts.length; j++) {
                ts.add(Integer.parseInt(sts[j]));
            }
            boolean contains = false;
            for (Integer id : idxs) {
                if (ts.contains(id)) {
                    contains = true;
                    if (ifdf == null) {
                        ifdf = new IFormatDataFile(conf);
                        ifdf.open(file);
                    }
                    this.idx2ifdfs.put(id, ifdf);
                }
            }
            if (contains) {
                ArrayList<Integer> cp = new ArrayList<Integer>();
                cp.addAll(ts);
                this.columnprojects.add(cp);
                this.cp2ifdfs.put(cp, ifdf);
            }
        }
    } else {
        for (int i = 0; i < statuss.length; i++) {
            String file = statuss[i].getPath().toString();
            IFormatDataFile ifdf = null;
            String idxstr = file.substring(file.lastIndexOf("_idx") + 4);
            String[] sts = idxstr.split("_");
            TreeSet<Integer> ts = new TreeSet<Integer>();
            ifdf = new IFormatDataFile(conf);
            ifdf.open(file);
            for (int j = 0; j < sts.length; j++) {
                int id = Integer.parseInt(sts[j]);
                ts.add(id);
                this.idx2ifdfs.put(id, ifdf);
            }
            ArrayList<Integer> cp = new ArrayList<Integer>();
            cp.addAll(ts);
            this.readidxs.addAll(ts);
            this.columnprojects.add(cp);
            this.cp2ifdfs.put(cp, ifdf);
        }
    }

    this.fieldtypes = new HashMap<Integer, IRecord.IFType>();
    for (Integer idx : this.readidxs) {
        this.fieldtypes.put(idx, this.idx2ifdfs.get(idx).fileInfo().head().fieldMap().fieldtypes().get(idx));
    }
    workstatus = ConstVar.WS_Read;
}
From source file:gobblin.data.management.retention.DatasetVersionFinderTest.java
License:Apache License
@Test
public void test() throws IOException {
    FileSystem fs = mock(FileSystem.class);

    String datasetPathStr = "/path/to/dataset";
    String dataset1 = "datasetVersion1";
    String dataset2 = "datasetVersion2";

    Path datasetPath = new Path(datasetPathStr);
    Path globbedPath = new Path(datasetPathStr + "/*");
    Path datasetVersion1 = new Path(datasetPathStr + "/" + dataset1);
    Path datasetVersion2 = new Path(datasetPathStr + "/" + dataset2);

    when(fs.globStatus(globbedPath)).thenReturn(new FileStatus[] {
            new FileStatus(0, true, 0, 0, 0, datasetVersion1),
            new FileStatus(0, true, 0, 0, 0, datasetVersion2) });

    DatasetVersionFinder<StringDatasetVersion> versionFinder =
            new MockDatasetVersionFinder(fs, new Properties());

    List<StringDatasetVersion> datasetVersions = Lists
            .newArrayList(versionFinder.findDatasetVersions(new MockDataset(datasetPath)));

    Assert.assertEquals(datasetVersions.size(), 2);
    Assert.assertEquals(datasetVersions.get(0).getVersion(), dataset1);
    Assert.assertEquals(datasetVersions.get(0).getPathsToDelete().iterator().next(), datasetVersion1);
    Assert.assertEquals(datasetVersions.get(1).getVersion(), dataset2);
    Assert.assertEquals(datasetVersions.get(1).getPathsToDelete().iterator().next(), datasetVersion2);
}
From source file:gobblin.runtime.template.PullFileToConfigConverter.java
License:Apache License
public void convert() throws IOException {
    Config baseConfig = ConfigFactory.parseString(DO_NOT_OVERRIDE_KEY + ": []");

    FileSystem pullFileFs = pullFileRootPath.getFileSystem(new Configuration());
    FileSystem outputFs = this.outputPath.getFileSystem(new Configuration());

    Config sysConfig = ConfigFactory.parseFile(this.sysConfigPath);

    PullFileLoader pullFileLoader = new PullFileLoader(this.pullFileRootPath, pullFileFs,
            PullFileLoader.DEFAULT_JAVA_PROPS_PULL_FILE_EXTENSIONS,
            PullFileLoader.DEFAULT_HOCON_PULL_FILE_EXTENSIONS);
    PackagedTemplatesJobCatalogDecorator catalog = new PackagedTemplatesJobCatalogDecorator();

    ConfigResolveOptions configResolveOptions = ConfigResolveOptions.defaults();
    configResolveOptions = configResolveOptions.setAllowUnresolved(true);

    ResourceBasedJobTemplate template;
    Config templateConfig;
    try {
        template = (ResourceBasedJobTemplate) catalog.getTemplate(templateURI.toUri());
        templateConfig = sysConfig.withFallback(template.getRawTemplateConfig()).withFallback(baseConfig)
                .resolve(configResolveOptions);
    } catch (SpecNotFoundException | JobTemplate.TemplateException exc) {
        throw new IOException(exc);
    }

    Set<String> doNotOverride = templateConfig.hasPath(DO_NOT_OVERRIDE_KEY)
            ? Sets.newHashSet(templateConfig.getStringList(DO_NOT_OVERRIDE_KEY))
            : Sets.<String>newHashSet();

    ConfigRenderOptions configRenderOptions = ConfigRenderOptions.defaults();
    configRenderOptions = configRenderOptions.setComments(false);
    configRenderOptions = configRenderOptions.setOriginComments(false);
    configRenderOptions = configRenderOptions.setFormatted(true);
    configRenderOptions = configRenderOptions.setJson(false);

    for (FileStatus pullFile : pullFileFs.globStatus(this.fileGlobToConvert)) {
        Config pullFileConfig = pullFileLoader.loadPullFile(pullFile.getPath(), ConfigFactory.empty(), true)
                .resolve();
        Map<String, String> outputConfigMap = Maps.newHashMap();

        outputConfigMap.put(ConfigurationKeys.JOB_TEMPLATE_PATH, this.templateURI.toString());

        boolean somethingChanged;
        do {
            somethingChanged = false;
            Config currentOutputConfig = ConfigFactory.parseMap(outputConfigMap);
            Config currentResolvedConfig = currentOutputConfig.withFallback(templateConfig)
                    .resolve(configResolveOptions);
            for (Map.Entry<Object, Object> entry : ConfigUtils.configToProperties(pullFileConfig).entrySet()) {
                String key = (String) entry.getKey();
                String value = (String) entry.getValue();
                try {
                    if ((!currentResolvedConfig.hasPath(key))
                            || (!currentResolvedConfig.getString(key).equals(value)
                                    && !doNotOverride.contains(key))) {
                        if (!FILTER_KEYS.contains(key)) {
                            somethingChanged = true;
                            outputConfigMap.put(key, value);
                        }
                    }
                } catch (ConfigException.NotResolved nre) {
                    // path is unresolved in config, will try again next iteration
                }
            }
        } while (somethingChanged);

        try {
            Config outputConfig = ConfigFactory.parseMap(outputConfigMap);
            Config currentResolvedConfig = outputConfig.withFallback(templateConfig).resolve();
            String rendered = outputConfig.root().render(configRenderOptions);

            Path newPath = PathUtils.removeExtension(pullFile.getPath(),
                    PullFileLoader.DEFAULT_JAVA_PROPS_PULL_FILE_EXTENSIONS.toArray(new String[] {}));
            newPath = PathUtils.addExtension(newPath, "conf");
            newPath = new Path(this.outputPath, newPath.getName());

            FSDataOutputStream os = outputFs.create(newPath);
            os.write(rendered.getBytes(Charsets.UTF_8));
            os.close();
        } catch (ConfigException.NotResolved nre) {
            throw new IOException("Not all configuration keys were resolved in pull file "
                    + pullFile.getPath(), nre);
        }
    }
}
From source file:gr.ntua.h2rdf.LoadTriples.DistinctIds.java
License:Open Source License
public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {
    //io.compression.codecs
    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    Configuration conf = new Configuration();
    Path blockProjection = new Path("blockIds/");
    Path translations = new Path("translations/");
    Path sample = new Path("sample/");
    Path temp = new Path("temp/");
    Path uniqueIds = new Path("uniqueIds/");
    FileSystem fs;
    try {
        fs = FileSystem.get(conf);
        if (fs.exists(uniqueIds)) {
            fs.delete(uniqueIds, true);
        }
        if (fs.exists(translations)) {
            fs.delete(translations, true);
        }
        if (fs.exists(blockProjection)) {
            fs.delete(blockProjection, true);
        }
        if (fs.exists(sample)) {
            fs.delete(sample, true);
        }
        if (fs.exists(temp)) {
            fs.delete(temp, true);
        }

        FileOutputFormat.setOutputPath(job, uniqueIds);
        Path inp = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inp);

        double type = 1;
        double datasetSize = 0;
        if (fs.isFile(inp)) {
            datasetSize = fs.getFileStatus(inp).getLen();
        } else if (fs.isDirectory(inp)) {
            FileStatus[] s = fs.listStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        } else {
            FileStatus[] s = fs.globStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        }
        datasetSize = datasetSize * type;
        System.out.println("type: " + type);
        System.out.println("datasetSize: " + datasetSize);

        samplingRate = (double) sampleChunk / (double) datasetSize;
        if (samplingRate >= 0.1) {
            samplingRate = 0.1;
        }
        if (samplingRate <= 0.001) {
            samplingRate = 0.001;
        }
        numReducers = (int) (datasetSize / ReducerChunk);
        if (numReducers == 0)
            numReducers = 1;
        numReducers++;
    } catch (IOException e) {
        e.printStackTrace();
    }

    HBaseAdmin hadmin = new HBaseAdmin(conf);
    HTableDescriptor desc = new HTableDescriptor(TABLE_NAME);
    HColumnDescriptor family = new HColumnDescriptor("counter");
    desc.addFamily(family);
    if (!hadmin.tableExists(TABLE_NAME)) {
        hadmin.createTable(desc);
    }

    job.setNumReduceTasks(numReducers);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(DistinctIds.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setPartitionerClass(SamplingPartitioner.class);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    job.getConfiguration().set("mapred.compress.map.output", "true");
    job.getConfiguration().set("mapred.map.output.compression.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");
    //job.setCombinerClass(Combiner.class);

    job.setJobName("Distinct Id Wordcount");
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setInt("io.sort.mb", 100);
    job.getConfiguration().setInt("io.file.buffer.size", 131072);
    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);

    return job;
}
From source file:hadoop.TestingDriver.java
License:Open Source License
public static Configuration addPathToDC(Configuration conf, String path) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] fstatus = fs.globStatus(new Path(path));
    Path[] listedPaths = FileUtil.stat2Paths(fstatus);
    for (Path p : listedPaths) {
        System.out.println(" Add File to DC " + p.toUri().toString());
        DistributedCache.addCacheFile(p.toUri(), conf);
    }
    return conf;
}
From source file:hitune.analysis.mapreduce.processor.FileFilter.FileFilter.java
License:Apache License
/**
 * Simply scan all those files under the path recursively
 * @param path
 * @param files
 */
public void scan(Path path, StringBuilder files) {
    //log.debug("parentpath: " + path.toString());
    if (files == null) {
        log.error("The files[StringBuilder] object isn't initialized");
        return;
    }
    try {
        //log.debug("pattern: " + pattern);
        FileSystem fs = path.getFileSystem(conf);
        FileStatus[] fstats = null;
        fstats = fs.globStatus(new Path(path.toString() + "/*"));
        for (FileStatus fstat : fstats) {
            //log.debug("current file/folder: " + fstat.getPath().toString());
            if (fstat.isDir()) {
                scan(fstat.getPath(), files);
            } else {
                FileStatus[] rst = null;
                if (pattern == null || pattern.equals("") || pattern.length() == 0) {
                    rst = fs.globStatus(fstat.getPath());
                } else {
                    rst = fs.globStatus(fstat.getPath(), new regpatternFilter());
                }
                if (rst != null && rst.length != 0) {
                    String filepath = rst[0].getPath().toString();
                    if (files.length() == 0) {
                        files.append(filepath);
                    } else {
                        files.append(SEPARATOR).append(filepath);
                    }
                }
            }
        }
    } catch (IOException e) {
        log.error("Cannot do the file system operation: " + path.toString());
        e.printStackTrace();
    }
}
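The regpatternFilter used above is a custom PathFilter defined elsewhere in that source file and not shown here. As a minimal sketch of the two-argument globStatus(Path, PathFilter) overload it relies on, a filter might look like the following; the class name SuffixFilter and the ".log" suffix check are illustrative stand-ins, not the actual HiTune logic:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Hypothetical filter: accepts only paths whose names end in ".log".
class SuffixFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        return path.getName().endsWith(".log");
    }
}

// Usage: only glob matches that also pass the filter are returned.
// FileStatus[] rst = fs.globStatus(new Path("/logs/*"), new SuffixFilter());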
From source file:IndexService.IColumnInputFormat.java
License:Open Source License
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    FileSystem fs = FileSystem.get(job);
    List<IColumnInputSplit> splits = new ArrayList<IColumnInputSplit>();
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();

    String[] inputfiles = job.getStrings("mapred.input.dir");
    for (String file : inputfiles) {
        FileStatus[] fss = fs.globStatus(new Path(file + "_idx*"));
        FileStatus status = null;
        long length = 0;
        for (FileStatus ss : fss) {
            if (ss.getLen() > length) {
                length = ss.getLen();
                status = ss;
            }
        }
        files.put(file, status);
    }

    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();
        Path keypath = new Path(filekey);
        long length = file.getLen();
        tmpPath = keypath;

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length <= 1) {
            IColumnInputSplit split = new IColumnInputSplit(keypath, length, blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            ISegmentIndex segmentIndex = ifd.segIndex();

            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                IColumnInputSplit split = new IColumnInputSplit(keypath, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline()
                                - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }
            ifd.close();
        }
    }

    if (splits.size() == 0) {
        splits.add(new IColumnInputSplit(tmpPath, 0, 0, 0, new String[0]));
    }
    System.out.println("Total # of splits: " + splits.size());
    return splits.toArray(new IColumnInputSplit[splits.size()]);
}
From source file:io.covert.dns.util.DumpResponses.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Text key = new Text();
    BytesWritable val = new BytesWritable();

    FileStatus[] listing;
    Path inpath = new Path(args[0]);
    if (fs.getFileStatus(inpath) != null && fs.getFileStatus(inpath).isDir())
        listing = fs.listStatus(inpath);
    else
        listing = fs.globStatus(inpath);

    for (FileStatus f : listing) {
        if (f.isDir() || f.getPath().getName().startsWith("_"))
            continue;

        System.out.println("Opening " + f.getPath() + " ...");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);
        while (reader.next(key, val)) {
            Message msg = new Message(val.getBytes());
            System.out.println(key + ": " + msg);
            System.out.println("---");
        }
        reader.close();
    }
    return 0;
}
From source file:io.hops.erasure_coding.TestErasureCodingManagerEndless.java
License:Apache License
@Override
public void setUp() throws Exception {
    cluster = new MiniDFSCluster.Builder(getConfig()).numDataNodes(NUMBER_OF_DATANODES).build();
    cluster.waitActive();
    fs = cluster.getFileSystem();

    FileSystem fs = getFileSystem();
    FileStatus[] files = fs.globStatus(new Path("/*"));
    for (FileStatus file : files) {
        fs.delete(file.getPath(), true);
    }
}