List of usage examples for org.apache.hadoop.fs.PathFilter
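Every example below follows the same idiom: implement PathFilter's single accept(Path) method and pass the filter to FileSystem.listStatus(Path, PathFilter) so that only matching entries are returned. As a primer, here is a minimal, self-contained sketch of that idiom; the directory /tmp/input and the ".gz" suffix are placeholder assumptions, not taken from any example below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterSketch {
    public static void main(String[] args) throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        // List only the entries whose name ends with ".gz".
        // The directory path is a placeholder for this sketch.
        FileStatus[] matches = fs.listStatus(new Path("/tmp/input"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().endsWith(".gz");
            }
        });
        for (FileStatus match : matches) {
            System.out.println(match.getPath());
        }
    }
}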
From source file:com.inmobi.conduit.distcp.tools.TestIntegration.java
License:Apache License
@Test
public void testJobConters() {
    try {
        Path listFile = new Path("target/tmp1/listing").makeQualified(fs);
        addEntries(listFile, "*");
        createFileForAudit("/conduit/streams/test1/2013/10/10/10/10/file1.gz");
        runTest(listFile, target, true);
        int numberOfCountersPerFile = 0;
        long sumOfCounterValues = 0;
        FileStatus[] statuses = fs.listStatus(counterOutputPath, new PathFilter() {
            public boolean accept(Path path) {
                return path.toString().contains("part");
            }
        });
        for (FileStatus status : statuses) {
            Scanner scanner = new Scanner(fs.open(status.getPath()));
            while (scanner.hasNext()) {
                String counterNameValue = null;
                try {
                    counterNameValue = scanner.next();
                    String tmp[] = counterNameValue.split(ConduitConstants.AUDIT_COUNTER_NAME_DELIMITER);
                    Assert.assertEquals(4, tmp.length);
                    Long numOfMsgs = Long.parseLong(tmp[3]);
                    numberOfCountersPerFile++;
                    sumOfCounterValues += numOfMsgs;
                } catch (Exception e) {
                    LOG.error("Counters file has malformed line with counter name = " + counterNameValue
                            + " ..skipping the line ", e);
                }
            }
        }
        // should have 2 counters per file
        Assert.assertEquals(2, numberOfCountersPerFile);
        // sum of all counter values should equal the total number of messages
        Assert.assertEquals(3, sumOfCounterValues);
        checkResult(target, 1);
    } catch (IOException e) {
        LOG.error("Exception encountered while testing distcp", e);
        Assert.fail("distcp failure");
    } finally {
        TestDistCpUtils.delete(fs, root);
    }
}
From source file:com.inmobi.conduit.local.LocalStreamServiceTest.java
License:Apache License
private void testClusterName(String configName, String currentClusterName) throws Exception {
    ConduitConfigParser parser = new ConduitConfigParser(configName);
    ConduitConfig config = parser.getConfig();
    Set<String> streamsToProcess = new HashSet<String>();
    streamsToProcess.addAll(config.getSourceStreams().keySet());
    Set<String> clustersToProcess = new HashSet<String>();
    Set<TestLocalStreamService> services = new HashSet<TestLocalStreamService>();
    Cluster currentCluster = null;
    for (SourceStream sStream : config.getSourceStreams().values()) {
        for (String cluster : sStream.getSourceClusters()) {
            clustersToProcess.add(cluster);
        }
    }
    if (currentClusterName != null) {
        currentCluster = config.getClusters().get(currentClusterName);
    }
    for (String clusterName : clustersToProcess) {
        Cluster cluster = config.getClusters().get(clusterName);
        cluster.getHadoopConf().set("mapred.job.tracker",
                super.CreateJobConf().get("mapred.job.tracker"));
        TestLocalStreamService service = new TestLocalStreamService(config, cluster, currentCluster,
                new NullCheckPointProvider(), streamsToProcess);
        services.add(service);
    }
    for (TestLocalStreamService service : services) {
        FileSystem fs = service.getFileSystem();
        service.preExecute();
        if (currentClusterName != null)
            Assert.assertEquals(service.getCurrentCluster().getName(), currentClusterName);
        // creating a job with empty input path
        Path tmpJobInputPath = new Path("/tmp/job/input/path");
        Map<FileStatus, String> fileListing = new TreeMap<FileStatus, String>();
        Set<FileStatus> trashSet = new HashSet<FileStatus>();
        // checkpointKey, CheckPointPath
        Table<String, String, String> checkpointPaths = HashBasedTable.create();
        service.createMRInput(tmpJobInputPath, fileListing, trashSet, checkpointPaths);
        Job testJobConf = service.createJob(tmpJobInputPath, 1000);
        testJobConf.waitForCompletion(true);
        int numberOfCountersPerFile = 0;
        long sumOfCounterValues = 0;
        Path outputCounterPath = new Path(new Path(service.getCluster().getTmpPath(), service.getName()),
                "counters");
        FileStatus[] statuses = fs.listStatus(outputCounterPath, new PathFilter() {
            public boolean accept(Path path) {
                return path.toString().contains("part");
            }
        });
        for (FileStatus fileSt : statuses) {
            Scanner scanner = new Scanner(fs.open(fileSt.getPath()));
            while (scanner.hasNext()) {
                String counterNameValue = null;
                try {
                    counterNameValue = scanner.next();
                    String tmp[] = counterNameValue.split(ConduitConstants.AUDIT_COUNTER_NAME_DELIMITER);
                    Assert.assertEquals(4, tmp.length);
                    Long numOfMsgs = Long.parseLong(tmp[3]);
                    numberOfCountersPerFile++;
                    sumOfCounterValues += numOfMsgs;
                } catch (Exception e) {
                    LOG.error("Counters file has malformed line with counter name =" + counterNameValue
                            + "..skipping the line", e);
                }
            }
        }
        // Should have 2 counters for each file
        Assert.assertEquals(NUMBER_OF_FILES * 2, numberOfCountersPerFile);
        // sum of all counter values should be equal to total number of messages
        Assert.assertEquals(NUMBER_OF_FILES * 3, sumOfCounterValues);
        Assert.assertEquals(testJobConf.getConfiguration().get(FS_DEFAULT_NAME_KEY),
                service.getCurrentCluster().getHadoopConf().get(FS_DEFAULT_NAME_KEY));
        Assert.assertEquals(testJobConf.getConfiguration().get(SRC_FS_DEFAULT_NAME_KEY),
                service.getCluster().getHadoopConf().get(FS_DEFAULT_NAME_KEY));
        if (currentCluster == null)
            Assert.assertEquals(testJobConf.getConfiguration().get(FS_DEFAULT_NAME_KEY),
                    testJobConf.getConfiguration().get(SRC_FS_DEFAULT_NAME_KEY));
        service.getFileSystem().delete(new Path(service.getCluster().getRootDir()), true);
    }
}
From source file:com.inmobi.databus.readers.CollectorStreamReader.java
License:Apache License
protected FileMap<CollectorFile> createFileMap() throws IOException {
    return new FileMap<CollectorFile>() {
        @Override
        protected PathFilter createPathFilter() {
            return new PathFilter() {
                @Override
                public boolean accept(Path p) {
                    if (p.getName().endsWith("_current") || p.getName().endsWith("_stats")) {
                        return false;
                    }
                    return true;
                }
            };
        }

        /*
         * prepare a fileMap with files which are beyond the stopTime
         */
        @Override
        protected void buildList() throws IOException {
            if (fsIsPathExists(streamDir)) {
                FileStatus[] fileStatuses = fsListFileStatus(streamDir, pathFilter);
                if (fileStatuses == null || fileStatuses.length == 0) {
                    LOG.info("No files in directory:" + streamDir);
                    return;
                }
                if (stopTime == null) {
                    for (FileStatus file : fileStatuses) {
                        addPath(file);
                    }
                } else {
                    for (FileStatus file : fileStatuses) {
                        Date currentTimeStamp = getDateFromCollectorFile(file.getPath().getName());
                        if (stopTime.before(currentTimeStamp)) {
                            stopListing();
                            continue;
                        }
                        addPath(file);
                    }
                }
            } else {
                LOG.info("Collector directory does not exist");
            }
        }

        @Override
        protected TreeMap<CollectorFile, FileStatus> createFilesMap() {
            return new TreeMap<CollectorFile, FileStatus>();
        }

        @Override
        protected CollectorFile getStreamFile(String fileName) {
            return CollectorFile.create(fileName);
        }

        @Override
        protected CollectorFile getStreamFile(FileStatus file) {
            return CollectorFile.create(file.getPath().getName());
        }
    };
}
From source file:com.inmobi.databus.readers.DatabusStreamWaitingReader.java
License:Apache License
@Override
protected FileMap<HadoopStreamFile> createFileMap() throws IOException {
    return new FileMap<HadoopStreamFile>() {
        @Override
        protected void buildList() throws IOException {
            buildListing(this, pathFilter);
        }

        @Override
        protected TreeMap<HadoopStreamFile, FileStatus> createFilesMap() {
            return new TreeMap<HadoopStreamFile, FileStatus>();
        }

        @Override
        protected HadoopStreamFile getStreamFile(String fileName) {
            throw new RuntimeException("Not implemented");
        }

        @Override
        protected HadoopStreamFile getStreamFile(FileStatus file) {
            return HadoopStreamFile.create(file);
        }

        @Override
        protected PathFilter createPathFilter() {
            return new PathFilter() {
                @Override
                public boolean accept(Path path) {
                    if (path.getName().startsWith("_")) {
                        return false;
                    }
                    return true;
                }
            };
        }
    };
}
From source file:com.inmobi.databus.readers.LocalStreamCollectorReader.java
License:Apache License
public FileMap<DatabusStreamFile> createFileMap() throws IOException {
    return new FileMap<DatabusStreamFile>() {
        @Override
        protected void buildList() throws IOException {
            buildListing(this, pathFilter);
        }

        @Override
        protected TreeMap<DatabusStreamFile, FileStatus> createFilesMap() {
            return new TreeMap<DatabusStreamFile, FileStatus>();
        }

        @Override
        protected DatabusStreamFile getStreamFile(String fileName) {
            return DatabusStreamFile.create(streamName, fileName);
        }

        @Override
        protected DatabusStreamFile getStreamFile(FileStatus file) {
            return DatabusStreamFile.create(streamName, file.getPath().getName());
        }

        @Override
        protected PathFilter createPathFilter() {
            return new PathFilter() {
                @Override
                public boolean accept(Path p) {
                    if (p.getName().startsWith(collector)) {
                        return true;
                    }
                    return false;
                }
            };
        }
    };
}
From source file:com.jkoolcloud.tnt4j.streams.inputs.HdfsFileLineStream.java
License:Apache License
/**
 * Searches for files matching name pattern. Name pattern also may contain path of directory, where file search
 * should be performed, e.g., C:/Tomcat/logs/localhost_access_log.*.txt. If no path is defined (just file name
 * pattern) then files are searched in {@code System.getProperty("user.dir")}. Files array is ordered by file
 * modification timestamp in descending order.
 *
 * @param path
 *            path of file
 * @param fs
 *            file system
 *
 * @return array of found files paths.
 * @throws IOException
 *             if files can't be listed by file system.
 *
 * @see FileSystem#listStatus(Path, PathFilter)
 * @see FilenameUtils#wildcardMatch(String, String, IOCase)
 */
public static Path[] searchFiles(Path path, FileSystem fs) throws IOException {
    FileStatus[] dir = fs.listStatus(path.getParent(), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            String name = path.getName();
            return FilenameUtils.wildcardMatch(name, "*", IOCase.INSENSITIVE); // NON-NLS
        }
    });
    Path[] activityFiles = new Path[dir == null ? 0 : dir.length];
    if (dir != null) {
        // newest files first
        Arrays.sort(dir, new Comparator<FileStatus>() {
            @Override
            public int compare(FileStatus o1, FileStatus o2) {
                return Long.valueOf(o1.getModificationTime()).compareTo(o2.getModificationTime()) * (-1);
            }
        });
        for (int i = 0; i < dir.length; i++) {
            activityFiles[i] = dir[i].getPath();
        }
    }
    return activityFiles;
}
From source file:com.linkedin.cubert.io.rubix.RubixFile.java
License:Open Source License
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException,
        ParseException, InstantiationException, IllegalAccessException {
    final int VERBOSE_NUM_ROWS = 4;
    Options options = new Options();
    options.addOption("h", "help", false, "shows this message");
    options.addOption("v", "verbose", false, "print summary and first few rows of each block");
    options.addOption("m", "metadata", false, "show the metadata");
    options.addOption("d", "dump", false,
            "dump the contents of the rubix file. Use -f for specifying format, and -o for specifying output location");
    options.addOption("f", "format", true, "the data format for dumping data (AVRO or TEXT). Default: TEXT");
    options.addOption("e", "extract", true,
            "Extract one rubix block matching the block id. Use -o for specifying output location");
    options.addOption("o", true, "Store the output at the specified location");
    CommandLineParser parser = new BasicParser();

    // parse the command line arguments
    CommandLine line = parser.parse(options, args);

    // show the help message
    if (line.hasOption("h")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(
                "RubixFile <rubix file or dir> [options]\nIf no options are provided, print a summary of the blocks.",
                options);
        return;
    }

    // validate provided options
    if (line.hasOption("d") && line.hasOption("e")) {
        System.err.println("Cannot dump (-d) and extract (-e) at the same time!");
        return;
    }

    // obtain the list of rubix files
    String[] files = line.getArgs();
    if (files == null || files.length == 0) {
        System.err.println("Rubix file not specified");
        return;
    }
    Configuration conf = new JobConf();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(files[0]);
    FileStatus[] allFiles;
    FileStatus status = fs.getFileStatus(path);
    if (status.isDir()) {
        allFiles = fs.listStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.toString().contains(RubixConstants.RUBIX_EXTENSION);
            }
        });
    } else {
        allFiles = new FileStatus[] { status };
    }

    // walk over all files and extract the trailer section
    List<RubixFile<Tuple, Object>> rfiles = new ArrayList<RubixFile<Tuple, Object>>();
    for (FileStatus s : allFiles) {
        Path p = s.getPath();
        RubixFile<Tuple, Object> rfile = new RubixFile<Tuple, Object>(conf, p);
        // if printing meta data information.. exit after first file (since all files
        // have the same meta data)
        if (line.hasOption("m")) {
            rfile.getKeyData();
            System.out.println(new ObjectMapper().writer().writeValueAsString(rfile.metadataJson));
            break;
        }
        rfiles.add(rfile);
    }

    // dump the data
    if (line.hasOption("d")) {
        String format = line.getOptionValue("f");
        if (format == null)
            format = "TEXT";
        format = format.trim().toUpperCase();
        if (format.equals("AVRO")) {
            // dumpAvro(rfiles, line.getOptionValue("o"));
            throw new UnsupportedOperationException(
                    "Dumping to avro is not currently supported. Please write a Cubert (map-only) script to store data in avro format");
        } else if (format.equals("TEXT")) {
            if (line.hasOption("o")) {
                System.err.println("Dumping TEXT format data *into a file* is not currently supported");
                return;
            }
            dumpText(rfiles, line.getOptionValue("o"), Integer.MAX_VALUE);
        } else {
            System.err.println("Invalid format [" + format + "] for dumping. Please use AVRO or TEXT");
            return;
        }
    } else if (line.hasOption("e")) {
        // extract one rubix block
        long blockId = Long.parseLong(line.getOptionValue("e"));
        extract(rfiles, blockId, line.getOptionValue("o"));
    } else {
        // print summary
        dumpText(rfiles, null, line.hasOption("v") ? VERBOSE_NUM_ROWS : 0);
    }
}
From source file:com.linkedin.mapred.AvroUtils.java
License:Open Source License
public static FileStatus[] getAvroPartFiles(JobConf conf, Path outPath) throws IOException {
    Path outputPath = outPath;
    FileSystem fileSystem = outputPath.getFileSystem(conf);
    FileStatus[] partFiles = fileSystem.listStatus(outputPath, new PathFilter() {
        @Override
        public boolean accept(Path path) {
            if (path.getName().endsWith(".avro")) {
                return true;
            }
            return false;
        }
    });
    return partFiles;
}
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
/**
 * Returns the output from {@link CrushReducer}. Each reducer writes out a mapping of source files to crush
 * output file.
 */
private List<FileStatus> getOutputMappings() throws IOException {
    try {
        FileStatus[] files = fs.listStatus(outDir, new PathFilter() {
            Matcher matcher = Pattern.compile("part-\\d+").matcher("dummy");

            @Override
            public boolean accept(Path path) {
                matcher.reset(path.getName());
                return matcher.matches();
            }
        });
        return asList(files);
    } catch (FileNotFoundException e) {
        return new LinkedList<FileStatus>();
    }
}
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
void writeDirs() throws IOException {
    print(Verbosity.INFO, "\nUsing temporary directory " + tmpDir.toUri().getPath() + "\n");
    FileStatus status = fs.getFileStatus(srcDir);
    Path tmpIn = new Path(tmpDir, "in");
    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");
    skippedFiles = new HashSet<String>();
    removableFiles = new HashSet<String>();
    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());
    Text key = new Text();
    Text value = new Text();
    Bucketer partitionBucketer = new Bucketer(maxTasks, 0, false);
    partitionBucketer.reset("partition-map");
    jobCounters = new Counters();
    int fileCount = 0;
    //Path bucketFile = new Path(tmpIn, "dirs_" + fileCount++);
    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);
    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();
            for (Path dir : dirs) {
                String dirPath = dir.toUri().getPath();
                print(Verbosity.INFO, "\n\n[" + dirPath + "]");
                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFilesMatcher == null)
                            return true;
                        ignoredFilesMatcher.reset(testPath.toUri().getPath());
                        boolean ignores = ignoredFilesMatcher.matches();
                        if (ignores)
                            LOG.info("Ignoring file " + testPath);
                        return !ignores;
                    }
                });
                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, "\n Directory is empty");
                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);
                    long crushableBytes = 0;
                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();
                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            String filePath = path.toUri().getPath();
                            boolean skipFile = false;
                            if (skippedFilesMatcher != null) {
                                skippedFilesMatcher.reset(filePath);
                                if (skippedFilesMatcher.matches()) {
                                    skipFile = true;
                                }
                            }
                            boolean changed = uncrushedFiles.add(filePath);
                            assert changed : path.toUri().getPath();
                            long fileLength = content.getLen();
                            if (!skipFile && fileLength <= maxEligibleSize) {
                                if (removeEmptyFiles && fileLength == 0)
                                    removableFiles.add(filePath);
                                else {
                                    crushables.add(content);
                                    crushableBytes += fileLength;
                                }
                            }
                        }
                    }
                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output
                     * file and then increment the number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }
                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }
                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, "\n Directory has no crushable files");
                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;
                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }
                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;
                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }
                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }
                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                        directoryBucketer.reset(getPathPart(dir));
                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }
                        List<Bucket> crushFiles = directoryBucketer.createBuckets();
                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                            print(Verbosity.INFO, "\n Directory skipped");
                        } else {
                            nBuckets += crushFiles.size();
                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);
                            print(Verbosity.INFO, "\n Generating " + crushFiles.size() + " output files");
                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();
                                List<String> filesInBucket = crushFile.contents();
                                print(Verbosity.INFO,
                                        format("\n Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), filesInBucket.size()));
                                key.set(bucketId);
                                for (String f : filesInBucket) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;
                                    pathMatcher.reset(f);
                                    pathMatcher.matches();
                                    value.set(pathMatcher.group(5));
                                    /*
                                     * Write one row per file to maximize the number of mappers.
                                     */
                                    writer.append(key, value);
                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }
                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, filesInBucket.size());
                                partitionBucketer.add(crushFile);
                            }
                        }
                    }
                    if (!removableFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n Marked " + removableFiles.size() + " files for removal");
                        for (String removable : removableFiles) {
                            uncrushedFiles.remove(removable);
                            print(Verbosity.VERBOSE, "\n " + removable);
                        }
                        jobCounters.incrCounter(MapperCounter.FILES_REMOVED, removableFiles.size());
                    }
                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n Skipped " + uncrushedFiles.size() + " files");
                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n " + uncrushed);
                        }
                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }
                    skippedFiles.addAll(uncrushedFiles);
                }
            }
            dirs = nextLevel;
        }
    } finally {
        writer.close();
    }
    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= maxTasks;
    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();
    int totalReducers = 0;
    for (Bucket partition : partitions) {
        String partitionName = partition.name();
        int p = Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1));
        partNum.set(p);
        if (partition.contents().size() > 0)
            totalReducers++;
        for (String bucketId : partition.contents()) {
            key.set(bucketId);
            writer.append(key, partNum);
        }
    }
    writer.close();
    print(Verbosity.INFO, "\n\nNumber of allocated reducers = " + totalReducers);
    job.setInt("mapreduce.job.reduces", totalReducers);
    DataOutputStream countersStream = fs.create(this.counters);
    jobCounters.write(countersStream);
    countersStream.close();
}
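A closing note on style: every example above implements PathFilter as an anonymous inner class, which was required on Java 7 and earlier. Because the interface declares exactly one abstract method, accept(Path), the same filters can be written as lambdas on Java 8 or later. A sketch, mirroring the part-file filter from the Crush example above (fs and outDir are assumed to be in scope as in that example):

FileStatus[] files = fs.listStatus(outDir, path -> path.getName().matches("part-\\d+"));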