List of usage examples for org.apache.hadoop.fs.PathFilter
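Every example below follows the same core pattern: an anonymous PathFilter implementation passed to FileSystem.listStatus(...) so that only matching paths are returned. As a minimal, self-contained sketch of that pattern (the /data directory and the part-file regex here are illustrative assumptions, not taken from any source file below):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Accept only reducer output files (part-r-00000, ...), skipping
        // hidden and bookkeeping files; "/data" is an illustrative path.
        FileStatus[] statuses = fs.listStatus(new Path("/data"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                String name = path.getName();
                return !name.startsWith("_") && !name.startsWith(".")
                        && name.matches("part-r-\\d+");
            }
        });

        for (FileStatus status : statuses) {
            System.out.println(status.getPath());
        }
    }
}

The same anonymous-class idiom appears in each example that follows; only the accept(Path) predicate changes.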
From source file:tv.icntv.recommend.algorithm.CorrelateJob.java
License:Apache License
/**
 * new String[]{
 *     String.format(configuration.get(sourceProperty), date),
 *     middleDirectory,
 *     sb.toString(),
 *     String.format(configuration.get(targetResultProperty), date)
 * }
 * @param strings
 * @return
 * @throws Exception
 */
@Override
public int run(String[] strings) throws Exception {
    Configuration configuration = getConf();
    Date date = getDateAdd(-1);
    String middleDirectory = String.format(configuration.get(correlateInputProperty), date);
    StringBuilder sb = new StringBuilder();
    sb.append("minSupport=").append(configuration.get(minSupportProperty, "3")).append(split)
            .append("maxHeapSize=1024").append(split).append("splitterPattern='[\t ]'").append(split)
            .append("input=").append(middleDirectory).append(split).append("output=")
            .append(String.format(configuration.get(fpGrowthProperty), date));
    HadoopUtils.deleteIfExist(middleDirectory);
    Job correlate = new Job(configuration, "???fp-growth");
    MapReduceUtils.initMapperJob(UserHistoryMapper.class, Text.class, Text.class, this.getClass(),
            correlate, getInput(configuration, -1)); // new Path(String.format(configuration.get(sourceProperty), date))
    // MapReduceUtils.initReducerJob(new Path(middleDirectory), UserHistoryReducer.class, correlate);
    correlate.setReducerClass(UserHistoryReducer.class);
    correlate.setOutputKeyClass(NullWritable.class);
    correlate.setOutputValueClass(Text.class);
    // correlate.setCombinerClass(UserHistoryCombiner.class);
    FileOutputFormat.setOutputPath(correlate, new Path(middleDirectory));
    if (!correlate.waitForCompletion(true)) {
        return 1;
    }

    Parameters parameter = getParameter(sb.toString());
    HadoopUtils.deleteIfExist(parameter.get("output"));
    PFPGrowth.runPFPGrowth(parameter, configuration);
    String output = parameter.get("output") + "/frequentpatterns";
    long count = HadoopUtils.count(new Path(output), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().matches("part-r-\\d*");
        }
    });
    if (count == 0) {
        return 1;
    }

    String resultPath = String.format(configuration.get(targetResultProperty), date);
    configuration.setLong("icntv.correlate.total.size", count);
    HadoopUtils.deleteIfExist(resultPath);
    Job result = new Job(configuration, "?");
    MapReduceUtils.initMapperJob(CorrelateInputMapper.class, Text.class, Text.class, this.getClass(),
            result, new Path(output));
    result.setInputFormatClass(SequenceFileInputFormat.class);
    MapReduceUtils.initReducerJob(new Path(resultPath), CorrelateOutPutReducer.class, result);
    result.waitForCompletion(true);
    return 0;
}
From source file:voldemort.store.readonly.mapreduce.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    try {
        Job job = new Job(config);
        job.getConfiguration().setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        job.getConfiguration().set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        job.getConfiguration().set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        job.getConfiguration().setBoolean("save.keys", saveKeys);
        job.getConfiguration().set("final.output.dir", outputDir.toString());
        job.getConfiguration().set("checksum.type", CheckSum.toString(checkSumType));
        job.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
        job.setMapperClass(mapperClass);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setReducerClass(HadoopStoreBuilderReducer.class);
        job.setInputFormatClass(inputFormatClass);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setJarByClass(getClass());

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(job.getConfiguration());
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // Delete the temp dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(job.getConfiguration());
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        int numChunks = Math.max(
                (int) (storeDef.getReplicationFactor() * size / cluster.getNumberOfNodes() / chunkSizeBytes),
                1);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes
                + ", num.chunks = " + numChunks);
        job.getConfiguration().setInt("num.chunks", numChunks);
        int numReduces = cluster.getNumberOfNodes() * numChunks;
        job.setNumReduceTasks(numReduces);
        logger.info("Number of reduces: " + numReduces);

        logger.info("Building store...");
        job.waitForCompletion(true);

        ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
        if (saveKeys)
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
        else
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());

        // Check that a folder exists for every node, each with a format metadata file
        for (Node node : cluster.getNodes()) {
            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                outputFs.mkdirs(nodePath); // Create empty folder
            }

            // Write metadata
            FSDataOutputStream metadataStream = outputFs.create(new Path(nodePath, ".metadata"));
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }

        if (checkSumType != CheckSumType.NONE) {
            // Generate a checksum for every node
            FileStatus[] nodes = outputFs.listStatus(outputDir);

            // Do a CheckSumOfCheckSum - similar to HDFS
            CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
            if (checkSumGenerator == null) {
                throw new VoldemortException("Could not generate checksum digests");
            }

            for (FileStatus node : nodes) {
                if (node.isDir()) {
                    // Pick up the per-file checksum outputs, skipping hidden files
                    FileStatus[] storeFiles = outputFs.listStatus(node.getPath(), new PathFilter() {
                        public boolean accept(Path arg0) {
                            return arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".");
                        }
                    });

                    if (storeFiles != null) {
                        Arrays.sort(storeFiles, new IndexFileLastComparator());
                        for (FileStatus file : storeFiles) {
                            FSDataInputStream input = outputFs.open(file.getPath());
                            byte[] fileCheckSum = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            checkSumGenerator.update(fileCheckSum);
                            outputFs.delete(file.getPath(), true);
                        }
                        FSDataOutputStream checkSumStream = outputFs.create(
                                new Path(node.getPath(), CheckSum.toString(checkSumType) + "checkSum.txt"));
                        checkSumStream.write(checkSumGenerator.getCheckSum());
                        checkSumStream.flush();
                        checkSumStream.close();
                    }
                }
            }
        }
    } catch (Exception e) {
        logger.error("Error = " + e);
        throw new VoldemortException(e);
    }
}
From source file:voldemort.store.readonly.mr.azkaban.AbstractHadoopJob.java
License:Apache License
public JobConf createJobConf(Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass)
        throws IOException, URISyntaxException {
    JobConf conf = new JobConf();
    // set custom class loader with custom find resource strategy.
    conf.setJobName(getId());
    conf.setMapperClass(mapperClass);
    conf.setReducerClass(reducerClass);

    String hadoop_ugi = _props.getString("hadoop.job.ugi", null);
    if (hadoop_ugi != null) {
        conf.set("hadoop.job.ugi", hadoop_ugi);
    }

    if (_props.getBoolean("is.local", false)) {
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.local.dir", "/tmp/map-red");
        info("Running locally, no hadoop jar set.");
    } else {
        setClassLoaderAndJar(conf, getClass());
        info("Setting hadoop jar file for class:" + getClass() + " to " + conf.getJar());
        info("*************************************************************************");
        info(" Running on Real Hadoop Cluster(" + conf.get("mapred.job.tracker") + ") ");
        info("*************************************************************************");
    }

    // set JVM options if present
    if (_props.containsKey("mapred.child.java.opts")) {
        conf.set("mapred.child.java.opts", _props.getString("mapred.child.java.opts"));
        info("mapred.child.java.opts set to " + _props.getString("mapred.child.java.opts"));
    }

    // set input and output paths if they are present
    if (_props.containsKey("input.paths")) {
        List<String> inputPaths = _props.getStringList("input.paths");
        if (inputPaths.size() == 0)
            throw new IllegalArgumentException("Must specify at least one value for property 'input.paths'");
        for (String path : inputPaths) {
            // Implied stuff, but good implied stuff
            if (path.endsWith(LATEST_SUFFIX)) {
                FileSystem fs = FileSystem.get(conf);
                // Skip hidden files and Hadoop bookkeeping output (_logs, _SUCCESS, ...)
                PathFilter filter = new PathFilter() {
                    @Override
                    public boolean accept(Path arg0) {
                        return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                    }
                };
                String latestPath = path.substring(0, path.length() - LATEST_SUFFIX.length());
                FileStatus[] statuses = fs.listStatus(new Path(latestPath), filter);
                Arrays.sort(statuses);
                path = statuses[statuses.length - 1].getPath().toString();
                System.out.println("Using latest folder: " + path);
            }
            HadoopUtils.addAllSubPaths(conf, new Path(path));
        }
    }

    if (_props.containsKey("output.path")) {
        String location = _props.get("output.path");
        if (location.endsWith("#CURRENT")) {
            DateTimeFormatter format = DateTimeFormat.forPattern(COMMON_FILE_DATE_PATTERN);
            String destPath = format.print(new DateTime());
            location = location.substring(0, location.length() - "#CURRENT".length()) + destPath;
            System.out.println("Store location set to " + location);
        }

        FileOutputFormat.setOutputPath(conf, new Path(location));
        // For testing purposes only: remove the output file if it exists
        if (_props.getBoolean("force.output.overwrite", false)) {
            FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
            fs.delete(FileOutputFormat.getOutputPath(conf), true);
        }
    }

    // Add external jars to the hadoop classpath
    String externalJarList = _props.getString("hadoop.external.jarFiles", null);
    if (externalJarList != null) {
        String[] jarFiles = externalJarList.split(",");
        for (String jarFile : jarFiles) {
            info("Adding external jar File:" + jarFile);
            DistributedCache.addFileToClassPath(new Path(jarFile), conf);
        }
    }

    // Add distributed cache files
    String cacheFileList = _props.getString("hadoop.cache.files", null);
    if (cacheFileList != null) {
        String[] cacheFiles = cacheFileList.split(",");
        for (String cacheFile : cacheFiles) {
            info("Adding Distributed Cache File:" + cacheFile);
            DistributedCache.addCacheFile(new URI(cacheFile), conf);
        }
    }

    // Add distributed cache archives
    String archiveFileList = _props.getString("hadoop.cache.archives", null);
    if (archiveFileList != null) {
        String[] archiveFiles = archiveFileList.split(",");
        for (String archiveFile : archiveFiles) {
            info("Adding Distributed Cache Archive File:" + archiveFile);
            DistributedCache.addCacheArchive(new URI(archiveFile), conf);
        }
    }

    String hadoopCacheJarDir = _props.getString("hdfs.default.classpath.dir", null);
    if (hadoopCacheJarDir != null) {
        FileSystem fs = FileSystem.get(conf);
        if (fs != null) {
            FileStatus[] status = fs.listStatus(new Path(hadoopCacheJarDir));
            if (status != null) {
                for (int i = 0; i < status.length; ++i) {
                    if (!status[i].isDir()) {
                        Path path = new Path(hadoopCacheJarDir, status[i].getPath().getName());
                        info("Adding Jar to Distributed Cache Archive File:" + path);
                        DistributedCache.addFileToClassPath(path, conf);
                    }
                }
            } else {
                info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " is empty.");
            }
        } else {
            info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " filesystem doesn't exist");
        }
    }

    // May want to add this to HadoopUtils, but will await refactoring
    for (String key : getProps().keySet()) {
        String lowerCase = key.toLowerCase();
        if (lowerCase.startsWith(HADOOP_PREFIX)) {
            String newKey = key.substring(HADOOP_PREFIX.length());
            conf.set(newKey, getProps().get(key));
        }
    }

    HadoopUtils.setPropsInJob(conf, getProps());
    return conf;
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // Delete the temp dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor()
                        / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);
            conf.setInputFormat(inputFormatClass);
            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));
            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));
            AvroJob.setMapperClass(conf, mapperClass);
            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }
        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers
                + ", save keys: " + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed, log the collision counters
        // (identical in both reducerPerBucket branches in the original)
        Counters counters = job.getCounters();
        if (saveKeys) {
            logger.info("Number of collisions in the job - "
                    + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
            logger.info("Maximum number of collisions for one entry - "
                    + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
        }

        // Do a CheckSumOfCheckSum - similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check that a folder exists for every node, each with a format metadata file
        for (Node node : cluster.getNodes()) {
            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {
                // Pick up the per-file checksum outputs, skipping hidden files
                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {
                    public boolean accept(Path arg0) {
                        return arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".");
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;
                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte[] fileCheckSum = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }
                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));
                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);
                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }
    } catch (Exception e) {
        logger.error("Error in Store builder", e);
        throw new VoldemortException(e);
    }
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java
License:Apache License
/**
 * Given a filesystem and path to a node, gets all the data files
 * (irrespective of partition, replica, etc.)
 *
 * Works only for {@link ReadOnlyStorageFormat#READONLY_V2}
 *
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @return Returns the list of all data files under this node directory
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(FileSystem fs, Path path) throws IOException {
    return fs.listStatus(path, new PathFilter() {
        public boolean accept(Path input) {
            // Data files are named <partition>_<replica>_<chunk>.data
            return input.getName().matches("^[\\d]+_[\\d]+_[\\d]+\\.data");
        }
    });
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java
License:Apache License
/**
 * Given a filesystem and path to a node, gets all the files which belong to
 * a partition and replica type
 *
 * Works only for {@link ReadOnlyStorageFormat#READONLY_V2}
 *
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @param partitionId The partition id for which we get the files
 * @param replicaType The replica type
 * @return Returns the list of files of this partition, replicaType
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(FileSystem fs, Path path, final int partitionId,
        final int replicaType) throws IOException {
    return fs.listStatus(path, new PathFilter() {
        public boolean accept(Path input) {
            // Match <partitionId>_<replicaType>_<chunk>.data
            return input.getName().matches("^" + Integer.toString(partitionId) + "_"
                    + Integer.toString(replicaType) + "_[\\d]+\\.data");
        }
    });
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java
License:Apache License
/**
 * Given a filesystem and path to a node, gets all the files which belong to
 * a partition, replica type and chunk id
 *
 * Works only for {@link ReadOnlyStorageFormat#READONLY_V2}
 *
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @param partitionId The partition id for which we get the files
 * @param replicaType The replica type
 * @param chunkId The chunk id
 * @return Returns the list of files of this partition, replicaType, chunkId
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(FileSystem fs, Path path, final int partitionId,
        final int replicaType, final int chunkId) throws IOException {
    return fs.listStatus(path, new PathFilter() {
        public boolean accept(Path input) {
            // Match exactly <partitionId>_<replicaType>_<chunkId>.data
            return input.getName().matches("^" + Integer.toString(partitionId) + "_"
                    + Integer.toString(replicaType) + "_" + Integer.toString(chunkId) + "\\.data");
        }
    });
}
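Taken together, the three getDataChunkFiles overloads above narrow the listing from all data files of a node down to a single chunk. A hedged usage sketch, where the node directory /output/node-0 and the values partition 4, replica 0, chunk 2 are purely illustrative:

FileSystem fs = FileSystem.get(new Configuration());
Path nodeDir = new Path("/output/node-0");

// All data files under the node directory
FileStatus[] all = HadoopStoreBuilderUtils.getDataChunkFiles(fs, nodeDir);

// Data files for partition 4, replica type 0
FileStatus[] forPartition = HadoopStoreBuilderUtils.getDataChunkFiles(fs, nodeDir, 4, 0);

// The single data file for partition 4, replica type 0, chunk 2
FileStatus[] oneChunk = HadoopStoreBuilderUtils.getDataChunkFiles(fs, nodeDir, 4, 0, 2);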
From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java
License:Apache License
/**
 * Looks for the latest (the alphabetically greatest) path contained in the
 * given directory that passes the specified regex pattern.
 *
 * @param fs The file system
 * @param directory The directory that will contain the versions
 * @param acceptRegex The String pattern
 * @return The latest versioned path, or null if no path matches
 * @throws IOException
 */
public static Path getLatestVersionedPath(FileSystem fs, Path directory, String acceptRegex)
        throws IOException {
    final String pattern = acceptRegex != null ? acceptRegex : "\\S+";
    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            return !arg0.getName().startsWith("_") && Pattern.matches(pattern, arg0.getName());
        }
    };

    FileStatus[] statuses = fs.listStatus(directory, filter);
    if (statuses == null || statuses.length == 0) {
        return null;
    }
    Arrays.sort(statuses);
    return statuses[statuses.length - 1].getPath();
}
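A hedged usage sketch for getLatestVersionedPath, assuming versioned store folders with all-digit names under /stores/my-store (both the layout and the regex are illustrative):

FileSystem fs = FileSystem.get(new Configuration());
// Pick the alphabetically greatest folder whose name is all digits
Path latest = HadoopUtils.getLatestVersionedPath(fs, new Path("/stores/my-store"), "\\d+");
if (latest != null) {
    System.out.println("Latest version: " + latest);
}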
From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java
License:Apache License
/**
 * Cleans up old data (alphabetically least) paths that are accepted by the
 * regex.
 *
 * @param fs The file system
 * @param directory The directory that will contain the versions
 * @param acceptRegex The String pattern
 * @param backupNumber The number of versions we should keep; anything older
 *        is cleaned up.
 * @throws IOException
 */
public static void cleanupOlderVersions(FileSystem fs, Path directory, final String acceptRegex,
        int backupNumber) throws IOException {
    if (backupNumber < 1) {
        logger.error("Number of versions must be 1 or greater");
        return;
    }

    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            return !arg0.getName().startsWith("_") && Pattern.matches(acceptRegex, arg0.getName());
        }
    };

    FileStatus[] statuses = fs.listStatus(directory, filter);
    if (statuses == null) {
        logger.info("No backup files found");
        return;
    }
    Arrays.sort(statuses);

    // Keep the backupNumber alphabetically-greatest versions, delete the rest
    int lastIndex = statuses.length - backupNumber;
    for (int i = 0; i < lastIndex; ++i) {
        logger.info("Deleting " + statuses[i].getPath());
        fs.delete(statuses[i].getPath(), true);
    }
}
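And a matching sketch for cleanupOlderVersions, again with an illustrative layout, keeping only the three most recent versions:

FileSystem fs = FileSystem.get(new Configuration());
// Deletes all but the 3 alphabetically greatest matching folders
HadoopUtils.cleanupOlderVersions(fs, new Path("/stores/my-store"), "\\d+", 3);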
From source file:wikiduper.clir.rp.TextDocnoMappingBuilder.java
License:Apache License
@Override
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    System.out.println("WTF WTF args: " + Arrays.toString(args));
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + TextDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TextDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = Job.getInstance(getConf());
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TextDocnoMappingBuilder.class);
    job.setNumReduceTasks(1);

    // Skip Hadoop bookkeeping files such as _logs and _SUCCESS
    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
    Path collectionPath = new Path(options.collection);
    for (FileStatus status : fs.listStatus(collectionPath, filter)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TextInputFormat.class); // options.inputFormat
    LOG.info("Input format : " + options.inputFormat);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    LOG.info("Here1\n");

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
    fs.delete(new Path(tmpDir), true);

    return 0;
}