List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Return all the files that match pathPattern and are not checksum files.
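Before the per-project examples, a minimal, self-contained sketch of calling globStatus (the /tmp/logs path pattern is illustrative, not taken from any example below). globStatus returns null when a non-glob path does not exist and an empty array when a glob matches nothing, so callers should guard against both, as most of the examples below do.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path pattern = new Path("/tmp/logs/2024-*/part-*"); // illustrative pattern
        FileSystem fs = pattern.getFileSystem(conf);

        // null: non-glob path that does not exist; empty: glob with no matches
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches == null || matches.length == 0) {
            System.out.println("no files match " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + "\t" + status.getLen() + " bytes");
        }
    }
}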
From source file:acromusashi.stream.bolt.hdfs.HdfsPreProcessor.java
License:Open Source License
/**
 * Preprocessing for the HDFS sink.<br>
 * Renames any leftover temporary files that match the target pattern.
 *
 * @param hdfs the target file system
 * @param baseUrl base URL
 * @param baseName base file name
 * @param tmpSuffix temporary file suffix
 */
public static void execute(FileSystem hdfs, String baseUrl, String baseName, String tmpSuffix) {
    String baseRealUrl = baseUrl;
    if (baseRealUrl.endsWith("/") == false) {
        baseRealUrl = baseRealUrl + "/";
    }

    String targetPattern = baseRealUrl + baseName + "[0-9]*" + tmpSuffix + "*";
    Path targetPathPattern = new Path(targetPattern);

    FileStatus[] targetTmpFiles = null;
    try {
        targetTmpFiles = hdfs.globStatus(targetPathPattern);
    } catch (IOException ioex) {
        logger.warn("Failed to search preprocess target files. Skip preprocess.", ioex);
        return;
    }

    if (targetTmpFiles.length == 0) {
        String logFormat = "Preprocess target files not exist. Path={0}";
        String logMessage = MessageFormat.format(logFormat, targetPattern);
        logger.info(logMessage);
        return;
    }

    if (logger.isInfoEnabled() == true) {
        printTargetPathList(targetTmpFiles);
    }

    for (FileStatus targetTmpFile : targetTmpFiles) {
        renameTmpFile(hdfs, targetTmpFile.getPath().toString(), tmpSuffix);
    }
}
From source file:bigfat.hadoop.HDFSDirInputStream.java
License:Apache License
/**
 * Create an input stream that will read through all the files in one
 * directory. Note that the files will be sorted by name, using the
 * given comparator.
 *
 * @param fs the file system
 * @param dir the directory to read
 * @param comp the comparator used to sort file names, or null for natural ordering
 * @throws IOException
 */
public HDFSDirInputStream(FileSystem fs, String dir, Comparator<String> comp) throws IOException {
    this.fs = fs;
    Path p = new Path(dir);

    FileStatus fstate = fs.getFileStatus(p);
    if (fstate.isDir()) {
        FileStatus[] child = fs.globStatus(new Path(dir + "/*"));
        LinkedList<String> s = new LinkedList<String>();
        Map<String, Path> map = new HashMap<String, Path>();
        for (FileStatus c : child) {
            if (c.isDir())
                continue;
            map.put(c.getPath().getName(), c.getPath());
            s.add(c.getPath().getName());
        }

        if (comp != null)
            Collections.sort(s, comp);
        else
            Collections.sort(s);

        Iterator<String> it = s.iterator();
        while (it.hasNext()) {
            String n = it.next();
            Path pr = map.get(n);
            this.appendFile(pr.toString());
        }
    } else {
        this.appendFile(dir);
    }
}
From source file:cascading.tap.GlobHfs.java
License:Open Source License
private Tap[] makeTaps(JobConf conf) throws IOException {
    FileStatus[] statusList = null;
    Path path = new Path(pathPattern);
    FileSystem fileSystem = path.getFileSystem(conf);

    if (pathFilter == null)
        statusList = fileSystem.globStatus(path);
    else
        statusList = fileSystem.globStatus(path, pathFilter);

    if (statusList == null || statusList.length == 0)
        throw new TapException("unable to find paths matching path pattern: " + pathPattern);

    List<Hfs> notEmpty = new ArrayList<Hfs>();

    for (int i = 0; i < statusList.length; i++) {
        // remove empty files. turns out a directory returns a length not zero
        // so this jives with the expectations set in the above javadoc
        if (statusList[i].getLen() != 0)
            notEmpty.add(new Hfs(getScheme(), statusList[i].getPath().toString()));
    }

    if (notEmpty.isEmpty())
        throw new TapException("all paths matching path pattern are zero length: " + pathPattern);

    return notEmpty.toArray(new Tap[notEmpty.size()]);
}
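The example above also exercises the two-argument overload globStatus(Path, PathFilter), which applies a filter to the glob matches. A minimal sketch of that overload in isolation (the part-* pattern and the .tmp suffix are illustrative):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class GlobWithFilterExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path pattern = new Path("/tmp/output/part-*"); // illustrative pattern
        FileSystem fs = pattern.getFileSystem(conf);

        // Keep only matches that do not carry a ".tmp" suffix.
        PathFilter noTmpFiles = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().endsWith(".tmp");
            }
        };

        FileStatus[] matches = fs.globStatus(pattern, noTmpFiles);
        if (matches == null || matches.length == 0) {
            System.out.println("nothing matched " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath());
        }
    }
}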
From source file:cascading.tap.hadoop.BaseDistCacheTap.java
License:Open Source License
@Override
public TupleEntryIterator openForRead(FlowProcess<? extends Configuration> flowProcess, RecordReader input)
        throws IOException {
    // always read via Hadoop FileSystem if in standalone/local mode, or if a RecordReader is provided
    if (HadoopUtil.isLocal(flowProcess.getConfig()) || input != null) {
        LOG.info("delegating to parent");
        return super.openForRead(flowProcess, input);
    }

    Path[] cachedFiles = getLocalCacheFiles(flowProcess);

    if (cachedFiles == null || cachedFiles.length == 0)
        return super.openForRead(flowProcess, null);

    List<Path> paths = new ArrayList<>();
    List<Tap> taps = new ArrayList<>();

    if (isSimpleGlob()) {
        FileSystem fs = FileSystem.get(flowProcess.getConfig());
        FileStatus[] statuses = fs.globStatus(getHfs().getPath());

        for (FileStatus status : statuses)
            paths.add(status.getPath());
    } else {
        paths.add(getHfs().getPath());
    }

    for (Path pathToFind : paths) {
        for (Path path : cachedFiles) {
            if (path.toString().endsWith(pathToFind.getName())) {
                LOG.info("found {} in distributed cache", path);
                taps.add(new Lfs(getScheme(), path.toString()));
            }
        }
    }

    if (taps.isEmpty()) // not in cache, read from HDFS
    {
        LOG.info("could not find files in local resource path. delegating to parent: {}",
                super.getIdentifier());
        return super.openForRead(flowProcess, input);
    }

    return new MultiSourceTap(taps.toArray(new Tap[taps.size()])).openForRead(flowProcess, input);
}
From source file:cascading.tap.hadoop.BaseDistCacheTap.java
License:Open Source License
private void registerHfs(FlowProcess<? extends Configuration> process, Configuration conf, Hfs hfs)
        throws IOException {
    if (isSimpleGlob()) {
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] statuses = fs.globStatus(getHfs().getPath());

        if (statuses == null || statuses.length == 0)
            throw new TapException(String.format(
                    "glob expression %s does not match any files on the filesystem", getHfs().getPath()));

        for (FileStatus fileStatus : statuses)
            registerURI(conf, fileStatus.getPath());
    } else {
        registerURI(conf, hfs.getPath());
    }

    hfs.sourceConfInitComplete(process, conf);
}
From source file:cascading.tap.hadoop.GlobHfs.java
License:Open Source License
private Hfs[] makeTaps(Configuration conf) throws IOException {
    FileStatus[] statusList;
    Path path = new Path(pathPattern);
    FileSystem fileSystem = path.getFileSystem(conf);

    if (pathFilter == null)
        statusList = fileSystem.globStatus(path);
    else
        statusList = fileSystem.globStatus(path, pathFilter);

    if (statusList == null || statusList.length == 0)
        throw new TapException("unable to find paths matching path pattern: " + pathPattern);

    List<Hfs> notEmpty = new ArrayList<Hfs>();

    for (int i = 0; i < statusList.length; i++) {
        // remove empty files. some hadoop versions return non-zero for dirs
        // so this jives with the expectations set in the above javadoc
        if (statusList[i].isDir() || statusList[i].getLen() != 0)
            notEmpty.add(new Hfs(getScheme(), statusList[i].getPath().toString()));
    }

    if (notEmpty.isEmpty())
        throw new TapException(
                "all paths matching path pattern are zero length and not directories: " + pathPattern);

    return notEmpty.toArray(new Hfs[notEmpty.size()]);
}
From source file:com.anhth12.lambda.BatchUpdateFunction.java
@Override
public Void call(JavaPairRDD<K, M> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }

    log.info("Beginning update at {}", timestamp);

    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }

    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                producer);
    }
    return null;
}
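The joinFSPaths helper used above is private to the source class and not shown here. A plausible sketch, assuming it only joins the qualified, escaped match paths into the comma-separated form that FileInputFormat.INPUT_DIR expects:

// Hypothetical reconstruction, not the original helper: joins glob matches
// into the comma-separated, escaped list that FileInputFormat.INPUT_DIR expects.
private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
    StringBuilder joined = new StringBuilder();
    for (FileStatus status : statuses) {
        if (joined.length() > 0) {
            joined.append(',');
        }
        Path path = fs.makeQualified(status.getPath());
        joined.append(org.apache.hadoop.util.StringUtils.escapeString(path.toString()));
    }
    return joined.toString();
}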
From source file:com.anhth12.lambda.BatchUpdateFunction2.java
@Override
public Void call(JavaRDD<MessageAndMetadata> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }

    log.info("Beginning update at {}", timestamp);

    JavaPairRDD<K, M> newDataKM = newData.mapToPair(new PairFunction<MessageAndMetadata, K, M>() {
        @Override
        public Tuple2<K, M> call(MessageAndMetadata t) throws Exception {
            return (Tuple2<K, M>) new Tuple2<>(new String(t.getKey()), new String(t.getPayload()));
        }
    });

    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }

    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newDataKM, pastData, modelDirString,
                producer);
    }
    return null;
}
From source file:com.asakusafw.bulkloader.cache.CacheBuildTest.java
License:Apache License
private List<TestDataModel> collect(CacheStorage storage, Path contents) throws IOException {
    List<TestDataModel> results = new ArrayList<>();
    FileSystem fs = storage.getFileSystem();
    for (FileStatus status : fs.globStatus(contents)) {
        results.addAll(collectContent(fs, status));
    }
    Collections.sort(results);
    return results;
}
From source file:com.asakusafw.bulkloader.collector.ExportFileSend.java
License:Apache License
/**
 * Writes the contents of the export target files as TSV and sends them
 * through a {@link com.asakusafw.bulkloader.transfer.FileList.Writer}.
 * @param <T> the data model type
 * @param targetTableModel the model class for the export target table
 * @param filePath the export file path (may be a glob pattern)
 * @param writer the writer to send files through
 * @param tableName the target table name
 * @return the number of records sent, or -1 if there were no files to send
 * @throws BulkLoaderSystemException if the send operation fails
 */
protected <T extends Writable> long send(Class<T> targetTableModel, String filePath, FileList.Writer writer,
        String tableName) throws BulkLoaderSystemException {
    FileSystem fs = null;
    String fileName = null;
    // maximum size of a single transfer file
    long maxSize = Long.parseLong(ConfigurationLoader.getProperty(Constants.PROP_KEY_EXP_LOAD_MAX_SIZE));
    try {
        TsvIoFactory<T> factory = new TsvIoFactory<>(targetTableModel);
        Configuration conf = new Configuration();
        fs = FileSystem.get(new URI(filePath), conf);

        // resolve the export file path pattern
        FileStatus[] status = fs.globStatus(new Path(filePath));
        Path[] listedPaths = FileUtil.stat2Paths(status);
        if (listedPaths == null) {
            LOG.info("TG-COLLECTOR-02006", tableName, filePath);
            return -1;
        } else {
            LOG.info("TG-COLLECTOR-02007", listedPaths.length, tableName, filePath);
        }
        long count = 0;
        boolean addEntry = false;
        for (Path path : listedPaths) {
            // skip system files
            if (isSystemFile(path)) {
                continue;
            }

            // TODO
            // open the next source file
            ModelInput<T> input = TemporaryStorage.openInput(conf, targetTableModel, path);
            try {
                while (true) {
                    // open the next entry in the transfer file list
                    addEntry = true;
                    fileName = FileNameUtil.createSendExportFileName(tableName, fileNameMap);
                    OutputStream output = writer.openNext(FileList.content(fileName));
                    try {
                        CountingOutputStream counter = new CountingOutputStream(output);
                        ModelOutput<T> modelOut = factory.createModelOutput(counter);
                        T model = factory.createModelObject();
                        LOG.info("TG-COLLECTOR-02004", tableName, path.toString(), fileName);

                        // copy each model object from the source file as TSV
                        boolean nextFile = false;
                        while (input.readTo(model)) {
                            // write the model object
                            modelOut.write(model);
                            count++;
                            // split the transfer once the size limit is exceeded;
                            // the byte count is only approximate because of
                            // char-to-byte conversion, but that is acceptable here
                            if (counter.getByteCount() > maxSize) {
                                nextFile = true;
                                break;
                            }
                        }
                        modelOut.close();
                        LOG.info("TG-COLLECTOR-02005", tableName, path.toString(), fileName);
                        if (nextFile) {
                            // size limit reached: continue with a new transfer file
                            continue;
                        } else {
                            // the current source file is exhausted
                            break;
                        }
                    } finally {
                        output.close();
                    }
                }
            } finally {
                input.close();
            }
        }
        if (addEntry) {
            return count;
        } else {
            assert count == 0;
            return -1;
        }
    } catch (IOException e) {
        throw new BulkLoaderSystemException(e, getClass(), "TG-COLLECTOR-02001", MessageFormat
                .format("Failed to read export file from HDFS. path={0}, send file={1}", filePath, fileName));
    } catch (URISyntaxException e) {
        throw new BulkLoaderSystemException(e, getClass(), "TG-COLLECTOR-02001",
                MessageFormat.format("Invalid URI for HDFS path: {0}", filePath));
    } finally {
        if (fs != null) {
            try {
                fs.close();
            } catch (IOException e) {
                throw new BulkLoaderSystemException(e, this.getClass(), "TG-COLLECTOR-02001",
                        MessageFormat.format("Failed to close the HDFS file system. path={0}", filePath));
            }
        }
    }
}