List of usage examples for org.apache.hadoop.fs.PathFilter
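Every example below follows the same core pattern: an anonymous PathFilter implementation passed to FileSystem.listStatus(...) so that only matching paths are returned. As a minimal, self-contained sketch of that pattern (the /data directory and the part-file regex here are illustrative assumptions, not taken from any source file below):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Accept only reducer output files (part-r-00000, ...), skipping
        // hidden and bookkeeping files; "/data" is an illustrative path.
        FileStatus[] statuses = fs.listStatus(new Path("/data"), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                String name = path.getName();
                return !name.startsWith("_") && !name.startsWith(".")
                        && name.matches("part-r-\\d+");
            }
        });

        for (FileStatus status : statuses) {
            System.out.println(status.getPath());
        }
    }
}

The same anonymous-class idiom appears in each example that follows; only the accept(Path) predicate changes.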
From source file:tv.icntv.recommend.algorithm.CorrelateJob.java
License:Apache License
/**
 * new String[]{
 *     String.format(configuration.get(sourceProperty), date),
 *     middleDirectory,
 *     sb.toString(),
 *     String.format(configuration.get(targetResultProperty), date)
 * }
 * @param strings
 * @return
 * @throws Exception
 */
@Override
public int run(String[] strings) throws Exception {
    Configuration configuration = getConf();
    Date date = getDateAdd(-1);
    String middleDirectory = String.format(configuration.get(correlateInputProperty), date);
    StringBuilder sb = new StringBuilder();
    sb.append("minSupport=").append(configuration.get(minSupportProperty, "3")).append(split)
            .append("maxHeapSize=1024").append(split).append("splitterPattern='[\t ]'").append(split)
            .append("input=").append(middleDirectory).append(split).append("output=")
            .append(String.format(configuration.get(fpGrowthProperty), date));
    HadoopUtils.deleteIfExist(middleDirectory);
    Job correlate = new Job(configuration, "???fp-growth");
    MapReduceUtils.initMapperJob(UserHistoryMapper.class, Text.class, Text.class, this.getClass(),
            correlate, getInput(configuration, -1)); // new Path(String.format(configuration.get(sourceProperty), date))
    // MapReduceUtils.initReducerJob(new Path(middleDirectory), UserHistoryReducer.class, correlate);
    correlate.setReducerClass(UserHistoryReducer.class);
    correlate.setOutputKeyClass(NullWritable.class);
    correlate.setOutputValueClass(Text.class);
    // correlate.setCombinerClass(UserHistoryCombiner.class);
    FileOutputFormat.setOutputPath(correlate, new Path(middleDirectory));
    if (!correlate.waitForCompletion(true)) {
        return 1;
    }

    Parameters parameter = getParameter(sb.toString());
    HadoopUtils.deleteIfExist(parameter.get("output"));
    PFPGrowth.runPFPGrowth(parameter, configuration);
    String output = parameter.get("output") + "/frequentpatterns";
    long count = HadoopUtils.count(new Path(output), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().matches("part-r-\\d*");
        }
    });
    if (count == 0) {
        return 1;
    }

    String resultPath = String.format(configuration.get(targetResultProperty), date);
    configuration.setLong("icntv.correlate.total.size", count);
    HadoopUtils.deleteIfExist(resultPath);
    Job result = new Job(configuration, "?");
    MapReduceUtils.initMapperJob(CorrelateInputMapper.class, Text.class, Text.class, this.getClass(),
            result, new Path(output));
    result.setInputFormatClass(SequenceFileInputFormat.class);
    MapReduceUtils.initReducerJob(new Path(resultPath), CorrelateOutPutReducer.class, result);
    result.waitForCompletion(true);
    return 0;
}
From source file:voldemort.store.readonly.mapreduce.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    try {
        Job job = new Job(config);
        job.getConfiguration().setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        job.getConfiguration().set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        job.getConfiguration().set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        job.getConfiguration().setBoolean("save.keys", saveKeys);
        job.getConfiguration().set("final.output.dir", outputDir.toString());
        job.getConfiguration().set("checksum.type", CheckSum.toString(checkSumType));
        job.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
        job.setMapperClass(mapperClass);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setReducerClass(HadoopStoreBuilderReducer.class);
        job.setInputFormatClass(inputFormatClass);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setJarByClass(getClass());

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(job.getConfiguration());
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // Delete the temp dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(job.getConfiguration());
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        int numChunks = Math.max(
                (int) (storeDef.getReplicationFactor() * size / cluster.getNumberOfNodes() / chunkSizeBytes),
                1);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes
                + ", num.chunks = " + numChunks);
        job.getConfiguration().setInt("num.chunks", numChunks);
        int numReduces = cluster.getNumberOfNodes() * numChunks;
        job.setNumReduceTasks(numReduces);
        logger.info("Number of reduces: " + numReduces);

        logger.info("Building store...");
        job.waitForCompletion(true);

        ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
        if (saveKeys)
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
        else
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());

        // Check that a folder exists for every node, each with a format metadata file
        for (Node node : cluster.getNodes()) {
            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                outputFs.mkdirs(nodePath); // Create empty folder
            }

            // Write metadata
            FSDataOutputStream metadataStream = outputFs.create(new Path(nodePath, ".metadata"));
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }

        if (checkSumType != CheckSumType.NONE) {
            // Generate a checksum for every node
            FileStatus[] nodes = outputFs.listStatus(outputDir);

            // Do a CheckSumOfCheckSum - similar to HDFS
            CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
            if (checkSumGenerator == null) {
                throw new VoldemortException("Could not generate checksum digests");
            }

            for (FileStatus node : nodes) {
                if (node.isDir()) {
                    // Pick up the per-file checksum outputs, skipping hidden files
                    FileStatus[] storeFiles = outputFs.listStatus(node.getPath(), new PathFilter() {
                        public boolean accept(Path arg0) {
                            return arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".");
                        }
                    });

                    if (storeFiles != null) {
                        Arrays.sort(storeFiles, new IndexFileLastComparator());
                        for (FileStatus file : storeFiles) {
                            FSDataInputStream input = outputFs.open(file.getPath());
                            byte[] fileCheckSum = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            checkSumGenerator.update(fileCheckSum);
                            outputFs.delete(file.getPath(), true);
                        }
                        FSDataOutputStream checkSumStream = outputFs.create(
                                new Path(node.getPath(), CheckSum.toString(checkSumType) + "checkSum.txt"));
                        checkSumStream.write(checkSumGenerator.getCheckSum());
                        checkSumStream.flush();
                        checkSumStream.close();
                    }
                }
            }
        }
    } catch (Exception e) {
        logger.error("Error = " + e);
        throw new VoldemortException(e);
    }
}
From source file:voldemort.store.readonly.mr.azkaban.AbstractHadoopJob.java
License:Apache License
public JobConf createJobConf(Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass)
        throws IOException, URISyntaxException {
    JobConf conf = new JobConf();
    // set custom class loader with custom find resource strategy.
    conf.setJobName(getId());
    conf.setMapperClass(mapperClass);
    conf.setReducerClass(reducerClass);

    String hadoop_ugi = _props.getString("hadoop.job.ugi", null);
    if (hadoop_ugi != null) {
        conf.set("hadoop.job.ugi", hadoop_ugi);
    }

    if (_props.getBoolean("is.local", false)) {
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.local.dir", "/tmp/map-red");
        info("Running locally, no hadoop jar set.");
    } else {
        setClassLoaderAndJar(conf, getClass());
        info("Setting hadoop jar file for class:" + getClass() + " to " + conf.getJar());
        info("*************************************************************************");
        info(" Running on Real Hadoop Cluster(" + conf.get("mapred.job.tracker") + ") ");
        info("*************************************************************************");
    }

    // set JVM options if present
    if (_props.containsKey("mapred.child.java.opts")) {
        conf.set("mapred.child.java.opts", _props.getString("mapred.child.java.opts"));
        info("mapred.child.java.opts set to " + _props.getString("mapred.child.java.opts"));
    }

    // set input and output paths if they are present
    if (_props.containsKey("input.paths")) {
        List<String> inputPaths = _props.getStringList("input.paths");
        if (inputPaths.size() == 0)
            throw new IllegalArgumentException("Must specify at least one value for property 'input.paths'");
        for (String path : inputPaths) {
            // Implied stuff, but good implied stuff
            if (path.endsWith(LATEST_SUFFIX)) {
                FileSystem fs = FileSystem.get(conf);
                // Skip hidden files and Hadoop bookkeeping output (_logs, _SUCCESS, ...)
                PathFilter filter = new PathFilter() {
                    @Override
                    public boolean accept(Path arg0) {
                        return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                    }
                };
                String latestPath = path.substring(0, path.length() - LATEST_SUFFIX.length());
                FileStatus[] statuses = fs.listStatus(new Path(latestPath), filter);
                Arrays.sort(statuses);
                path = statuses[statuses.length - 1].getPath().toString();
                System.out.println("Using latest folder: " + path);
            }
            HadoopUtils.addAllSubPaths(conf, new Path(path));
        }
    }

    if (_props.containsKey("output.path")) {
        String location = _props.get("output.path");
        if (location.endsWith("#CURRENT")) {
            DateTimeFormatter format = DateTimeFormat.forPattern(COMMON_FILE_DATE_PATTERN);
            String destPath = format.print(new DateTime());
            location = location.substring(0, location.length() - "#CURRENT".length()) + destPath;
            System.out.println("Store location set to " + location);
        }

        FileOutputFormat.setOutputPath(conf, new Path(location));
        // For testing purposes only: remove the output file if it exists
        if (_props.getBoolean("force.output.overwrite", false)) {
            FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
            fs.delete(FileOutputFormat.getOutputPath(conf), true);
        }
    }

    // Add external jars to the hadoop classpath
    String externalJarList = _props.getString("hadoop.external.jarFiles", null);
    if (externalJarList != null) {
        String[] jarFiles = externalJarList.split(",");
        for (String jarFile : jarFiles) {
            info("Adding external jar File:" + jarFile);
            DistributedCache.addFileToClassPath(new Path(jarFile), conf);
        }
    }

    // Add distributed cache files
    String cacheFileList = _props.getString("hadoop.cache.files", null);
    if (cacheFileList != null) {
        String[] cacheFiles = cacheFileList.split(",");
        for (String cacheFile : cacheFiles) {
            info("Adding Distributed Cache File:" + cacheFile);
            DistributedCache.addCacheFile(new URI(cacheFile), conf);
        }
    }

    // Add distributed cache archives
    String archiveFileList = _props.getString("hadoop.cache.archives", null);
    if (archiveFileList != null) {
        String[] archiveFiles = archiveFileList.split(",");
        for (String archiveFile : archiveFiles) {
            info("Adding Distributed Cache Archive File:" + archiveFile);
            DistributedCache.addCacheArchive(new URI(archiveFile), conf);
        }
    }

    String hadoopCacheJarDir = _props.getString("hdfs.default.classpath.dir", null);
    if (hadoopCacheJarDir != null) {
        FileSystem fs = FileSystem.get(conf);
        if (fs != null) {
            FileStatus[] status = fs.listStatus(new Path(hadoopCacheJarDir));
            if (status != null) {
                for (int i = 0; i < status.length; ++i) {
                    if (!status[i].isDir()) {
                        Path path = new Path(hadoopCacheJarDir, status[i].getPath().getName());
                        info("Adding Jar to Distributed Cache Archive File:" + path);
                        DistributedCache.addFileToClassPath(path, conf);
                    }
                }
            } else {
                info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " is empty.");
            }
        } else {
            info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " filesystem doesn't exist");
        }
    }

    // May want to add this to HadoopUtils, but will await refactoring
    for (String key : getProps().keySet()) {
        String lowerCase = key.toLowerCase();
        if (lowerCase.startsWith(HADOOP_PREFIX)) {
            String newKey = key.substring(HADOOP_PREFIX.length());
            conf.set(newKey, getProps().get(key));
        }
    }

    HadoopUtils.setPropsInJob(conf, getProps());
    return conf;
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // Delete the temp dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor()
                        / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);
            conf.setInputFormat(inputFormatClass);
            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));
            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));
            AvroJob.setMapperClass(conf, mapperClass);
            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }
        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers
                + ", save keys: " + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed, log the collision counters
        // (identical in both reducerPerBucket branches in the original)
        Counters counters = job.getCounters();
        if (saveKeys) {
            logger.info("Number of collisions in the job - "
                    + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
            logger.info("Maximum number of collisions for one entry - "
                    + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
        }

        // Do a CheckSumOfCheckSum - similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check that a folder exists for every node, each with a format metadata file
        for (Node node : cluster.getNodes()) {
            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {
                // Pick up the per-file checksum outputs, skipping hidden files
                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {
                    public boolean accept(Path arg0) {
                        return arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".");
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;
                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte[] fileCheckSum = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }
                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));
                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);
                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }
    } catch (Exception e) {
        logger.error("Error in Store builder", e);
        throw new VoldemortException(e);
    }
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java
License:Apache License
/**
 * Given a filesystem and path to a node, gets all the data files
 * (irrespective of partition, replica, etc.)
 *
 * Works only for {@link ReadOnlyStorageFormat#READONLY_V2}
 *
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @return Returns the list of all data files under this node directory
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(FileSystem fs, Path path) throws IOException {
    return fs.listStatus(path, new PathFilter() {
        public boolean accept(Path input) {
            // Data files are named <partition>_<replica>_<chunk>.data
            return input.getName().matches("^[\\d]+_[\\d]+_[\\d]+\\.data");
        }
    });
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java
License:Apache License
/**
 * Given a filesystem and path to a node, gets all the files which belong to
 * a partition and replica type
 *
 * Works only for {@link ReadOnlyStorageFormat#READONLY_V2}
 *
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @param partitionId The partition id for which we get the files
 * @param replicaType The replica type
 * @return Returns the list of files of this partition, replicaType
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(FileSystem fs, Path path, final int partitionId,
        final int replicaType) throws IOException {
    return fs.listStatus(path, new PathFilter() {
        public boolean accept(Path input) {
            // Match <partitionId>_<replicaType>_<chunk>.data
            return input.getName().matches("^" + Integer.toString(partitionId) + "_"
                    + Integer.toString(replicaType) + "_[\\d]+\\.data");
        }
    });
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java
License:Apache License
/**
 * Given a filesystem and path to a node, gets all the files which belong to
 * a partition, replica type and chunk id
 *
 * Works only for {@link ReadOnlyStorageFormat#READONLY_V2}
 *
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @param partitionId The partition id for which we get the files
 * @param replicaType The replica type
 * @param chunkId The chunk id
 * @return Returns the list of files of this partition, replicaType, chunkId
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(FileSystem fs, Path path, final int partitionId,
        final int replicaType, final int chunkId) throws IOException {
    return fs.listStatus(path, new PathFilter() {
        public boolean accept(Path input) {
            // Match exactly <partitionId>_<replicaType>_<chunkId>.data
            return input.getName().matches("^" + Integer.toString(partitionId) + "_"
                    + Integer.toString(replicaType) + "_" + Integer.toString(chunkId) + "\\.data");
        }
    });
}
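Taken together, the three getDataChunkFiles overloads above narrow the listing from all data files of a node down to a single chunk. A hedged usage sketch, where the node directory /output/node-0 and the values partition 4, replica 0, chunk 2 are purely illustrative:

FileSystem fs = FileSystem.get(new Configuration());
Path nodeDir = new Path("/output/node-0");

// All data files under the node directory
FileStatus[] all = HadoopStoreBuilderUtils.getDataChunkFiles(fs, nodeDir);

// Data files for partition 4, replica type 0
FileStatus[] forPartition = HadoopStoreBuilderUtils.getDataChunkFiles(fs, nodeDir, 4, 0);

// The single data file for partition 4, replica type 0, chunk 2
FileStatus[] oneChunk = HadoopStoreBuilderUtils.getDataChunkFiles(fs, nodeDir, 4, 0, 2);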
From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java
License:Apache License
/**
 * Looks for the latest (the alphabetically greatest) path contained in the
 * given directory that passes the specified regex pattern.
 *
 * @param fs The file system
 * @param directory The directory that will contain the versions
 * @param acceptRegex The String pattern
 * @return The latest versioned path, or null if no path matches
 * @throws IOException
 */
public static Path getLatestVersionedPath(FileSystem fs, Path directory, String acceptRegex)
        throws IOException {
    final String pattern = acceptRegex != null ? acceptRegex : "\\S+";
    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            return !arg0.getName().startsWith("_") && Pattern.matches(pattern, arg0.getName());
        }
    };

    FileStatus[] statuses = fs.listStatus(directory, filter);
    if (statuses == null || statuses.length == 0) {
        return null;
    }
    Arrays.sort(statuses);
    return statuses[statuses.length - 1].getPath();
}
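A hedged usage sketch for getLatestVersionedPath, assuming versioned store folders with all-digit names under /stores/my-store (both the layout and the regex are illustrative):

FileSystem fs = FileSystem.get(new Configuration());
// Pick the alphabetically greatest folder whose name is all digits
Path latest = HadoopUtils.getLatestVersionedPath(fs, new Path("/stores/my-store"), "\\d+");
if (latest != null) {
    System.out.println("Latest version: " + latest);
}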
From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java
License:Apache License
/**
 * Cleans up old data (alphabetically least) paths that are accepted by the
 * regex.
 *
 * @param fs The file system
 * @param directory The directory that will contain the versions
 * @param acceptRegex The String pattern
 * @param backupNumber The number of versions we should keep; anything older
 *        is cleaned up.
 * @throws IOException
 */
public static void cleanupOlderVersions(FileSystem fs, Path directory, final String acceptRegex,
        int backupNumber) throws IOException {
    if (backupNumber < 1) {
        logger.error("Number of versions must be 1 or greater");
        return;
    }

    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path arg0) {
            return !arg0.getName().startsWith("_") && Pattern.matches(acceptRegex, arg0.getName());
        }
    };

    FileStatus[] statuses = fs.listStatus(directory, filter);
    if (statuses == null) {
        logger.info("No backup files found");
        return;
    }
    Arrays.sort(statuses);

    // Keep the backupNumber alphabetically-greatest versions, delete the rest
    int lastIndex = statuses.length - backupNumber;
    for (int i = 0; i < lastIndex; ++i) {
        logger.info("Deleting " + statuses[i].getPath());
        fs.delete(statuses[i].getPath(), true);
    }
}
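And a matching sketch for cleanupOlderVersions, again with an illustrative layout, keeping only the three most recent versions:

FileSystem fs = FileSystem.get(new Configuration());
// Deletes all but the 3 alphabetically greatest matching folders
HadoopUtils.cleanupOlderVersions(fs, new Path("/stores/my-store"), "\\d+", 3);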
From source file:wikiduper.clir.rp.TextDocnoMappingBuilder.java
License:Apache License
@Override
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    System.out.println("WTF WTF args: " + Arrays.toString(args));
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + TextDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TextDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = Job.getInstance(getConf());
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TextDocnoMappingBuilder.class);
    job.setNumReduceTasks(1);

    // Skip Hadoop bookkeeping files such as _logs and _SUCCESS
    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
    Path collectionPath = new Path(options.collection);
    for (FileStatus status : fs.listStatus(collectionPath, filter)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TextInputFormat.class); // options.inputFormat
    LOG.info("Input format : " + options.inputFormat);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    LOG.info("Here1\n");

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
    fs.delete(new Path(tmpDir), true);

    return 0;
}