Example usage for org.apache.hadoop.fs PathFilter PathFilter

Introduction

This page collects example usages of the org.apache.hadoop.fs.PathFilter interface, which is typically implemented as an anonymous class and passed to file-listing APIs such as FileSystem.listStatus.

Prototype

public interface PathFilter {
    boolean accept(Path path);
}
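
The examples below all instantiate PathFilter as an anonymous class and hand it to a file-listing call. As a minimal, self-contained sketch (the class name and command-line path argument are illustrative assumptions, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterExample {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());

        // Anonymous PathFilter that skips Hadoop side files such as _SUCCESS and hidden files.
        PathFilter filter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                String name = path.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        };

        // listStatus applies the filter to every child of the directory.
        for (FileStatus status : fs.listStatus(new Path(args[0]), filter)) {
            System.out.println(status.getPath());
        }
    }
}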

Usage

From source file: tv.icntv.recommend.algorithm.CorrelateJob.java

License: Apache License

/**
 * new String[]{
 *     String.format(configuration.get(sourceProperty), date),
 *     middleDirectory,
 *     sb.toString(),
 *     String.format(configuration.get(targetResultProperty), date)
 * }
 *
 * @param strings
 * @return 0 on success, 1 on failure
 * @throws Exception
 */
@Override
public int run(String[] strings) throws Exception {
    Configuration configuration = getConf();
    Date date = getDateAdd(-1);

    String middleDirectory = String.format(configuration.get(correlateInputProperty), date);
    StringBuilder sb = new StringBuilder();
    sb.append("minSupport=").append(configuration.get(minSupportProperty, "3")).append(split)
            .append("maxHeapSize=1024").append(split).append("splitterPattern='[\t ]'").append(split)
            .append("input=").append(middleDirectory).append(split).append("output=")
            .append(String.format(configuration.get(fpGrowthProperty), date));

    HadoopUtils.deleteIfExist(middleDirectory);
    Job correlate = new Job(configuration, "???fp-growth");
    MapReduceUtils.initMapperJob(UserHistoryMapper.class, Text.class, Text.class, this.getClass(), correlate,
            getInput(configuration, -1));//new Path(String.format(configuration.get(sourceProperty),date))
    //        MapReduceUtils.initReducerJob(new Path(middleDirectory), UserHistoryReducer.class, correlate);
    correlate.setReducerClass(UserHistoryReducer.class);
    correlate.setOutputKeyClass(NullWritable.class);
    correlate.setOutputValueClass(Text.class);
    //        correlate.setCombinerClass(UserHistoryCombiner.class);
    FileOutputFormat.setOutputPath(correlate, new Path(middleDirectory));
    if (!correlate.waitForCompletion(true)) {
        return 1;
    }
    Parameters parameter = getParameter(sb.toString());
    HadoopUtils.deleteIfExist(parameter.get("output"));
    PFPGrowth.runPFPGrowth(parameter, configuration);
    String output = parameter.get("output") + "/frequentpatterns";
    long count = HadoopUtils.count(new Path(output), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().matches("part-r-\\d*");
        }
    });
    if (count == 0) {
        return 1;
    }
    String resultPath = String.format(configuration.get(targetResultProperty), date);
    configuration.setLong("icntv.correlate.total.size", count);
    HadoopUtils.deleteIfExist(resultPath);
    Job result = new Job(configuration, "?");
    MapReduceUtils.initMapperJob(CorrelateInputMapper.class, Text.class, Text.class, this.getClass(), result,
            new Path(output));
    result.setInputFormatClass(SequenceFileInputFormat.class);

    MapReduceUtils.initReducerJob(new Path(resultPath), CorrelateOutPutReducer.class, result);
    result.waitForCompletion(true);
    return 0;
}
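
HadoopUtils.count above is a project-specific helper whose implementation is not shown on this page. A plausible stand-in, counting the directory entries that pass the filter, might look like the following sketch (the class name, method name, and the extra Configuration parameter are assumptions):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathCountSketch {
    // Hypothetical equivalent of the HadoopUtils.count(Path, PathFilter) call above:
    // list the directory with the filter applied and return the number of matching entries.
    public static long count(Configuration conf, Path dir, PathFilter filter) throws IOException {
        FileSystem fs = dir.getFileSystem(conf);
        FileStatus[] matches = fs.listStatus(dir, filter);
        return matches == null ? 0 : matches.length;
    }
}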

From source file: voldemort.store.readonly.mapreduce.HadoopStoreBuilder.java

License: Apache License

/**
 * Run the job
 */
public void build() {
    try {
        Job job = new Job(config);
        job.getConfiguration().setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        job.getConfiguration().set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        job.getConfiguration().set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        job.getConfiguration().setBoolean("save.keys", saveKeys);
        job.getConfiguration().set("final.output.dir", outputDir.toString());
        job.getConfiguration().set("checksum.type", CheckSum.toString(checkSumType));
        job.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
        job.setMapperClass(mapperClass);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setReducerClass(HadoopStoreBuilderReducer.class);
        job.setInputFormatClass(inputFormatClass);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setJarByClass(getClass());

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(job.getConfiguration());
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete the temp dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(job.getConfiguration());
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        int numChunks = Math.max(
                (int) (storeDef.getReplicationFactor() * size / cluster.getNumberOfNodes() / chunkSizeBytes),
                1);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes
                + ",  num.chunks = " + numChunks);
        job.getConfiguration().setInt("num.chunks", numChunks);
        int numReduces = cluster.getNumberOfNodes() * numChunks;
        job.setNumReduceTasks(numReduces);
        logger.info("Number of reduces: " + numReduces);

        logger.info("Building store...");
        job.waitForCompletion(true);

        ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
        if (saveKeys)
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
        else
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());

        // Check if all folder exists and with format file
        for (Node node : cluster.getNodes()) {
            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                outputFs.mkdirs(nodePath); // Create empty folder
            }

            // Write metadata
            FSDataOutputStream metadataStream = outputFs.create(new Path(nodePath, ".metadata"));
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }

        if (checkSumType != CheckSumType.NONE) {

            // Generate checksum for every node
            FileStatus[] nodes = outputFs.listStatus(outputDir);

            // Do a CheckSumOfCheckSum - Similar to HDFS
            CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
            if (checkSumGenerator == null) {
                throw new VoldemortException("Could not generate checksum digests");
            }

            for (FileStatus node : nodes) {
                if (node.isDir()) {
                    FileStatus[] storeFiles = outputFs.listStatus(node.getPath(), new PathFilter() {

                        public boolean accept(Path arg0) {
                            if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                                return true;
                            }
                            return false;
                        }
                    });

                    if (storeFiles != null) {
                        Arrays.sort(storeFiles, new IndexFileLastComparator());
                        for (FileStatus file : storeFiles) {
                            FSDataInputStream input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            checkSumGenerator.update(fileCheckSum);
                            outputFs.delete(file.getPath(), true);
                        }
                        FSDataOutputStream checkSumStream = outputFs.create(
                                new Path(node.getPath(), CheckSum.toString(checkSumType) + "checkSum.txt"));
                        checkSumStream.write(checkSumGenerator.getCheckSum());
                        checkSumStream.flush();
                        checkSumStream.close();

                    }
                }
            }
        }
    } catch (Exception e) {
        logger.error("Error = " + e);
        throw new VoldemortException(e);
    }

}

From source file: voldemort.store.readonly.mr.azkaban.AbstractHadoopJob.java

License: Apache License

public JobConf createJobConf(Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass)
        throws IOException, URISyntaxException {
    JobConf conf = new JobConf();
    // set custom class loader with custom find resource strategy.

    conf.setJobName(getId());
    conf.setMapperClass(mapperClass);
    conf.setReducerClass(reducerClass);

    String hadoop_ugi = _props.getString("hadoop.job.ugi", null);
    if (hadoop_ugi != null) {
        conf.set("hadoop.job.ugi", hadoop_ugi);
    }

    if (_props.getBoolean("is.local", false)) {
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.local.dir", "/tmp/map-red");

        info("Running locally, no hadoop jar set.");
    } else {
        setClassLoaderAndJar(conf, getClass());
        info("Setting hadoop jar file for class:" + getClass() + "  to " + conf.getJar());
        info("*************************************************************************");
        info("          Running on Real Hadoop Cluster(" + conf.get("mapred.job.tracker") + ")           ");
        info("*************************************************************************");
    }

    // set JVM options if present
    if (_props.containsKey("mapred.child.java.opts")) {
        conf.set("mapred.child.java.opts", _props.getString("mapred.child.java.opts"));
        info("mapred.child.java.opts set to " + _props.getString("mapred.child.java.opts"));
    }

    // set input and output paths if they are present
    if (_props.containsKey("input.paths")) {
        List<String> inputPaths = _props.getStringList("input.paths");
        if (inputPaths.size() == 0)
            throw new IllegalArgumentException("Must specify at least one value for property 'input.paths'");
        for (String path : inputPaths) {
            // Implied stuff, but good implied stuff
            if (path.endsWith(LATEST_SUFFIX)) {
                FileSystem fs = FileSystem.get(conf);

                PathFilter filter = new PathFilter() {

                    @Override
                    public boolean accept(Path arg0) {
                        return !arg0.getName().startsWith("_") && !arg0.getName().startsWith(".");
                    }
                };

                String latestPath = path.substring(0, path.length() - LATEST_SUFFIX.length());
                FileStatus[] statuses = fs.listStatus(new Path(latestPath), filter);

                Arrays.sort(statuses);

                path = statuses[statuses.length - 1].getPath().toString();
                System.out.println("Using latest folder: " + path);
            }
            HadoopUtils.addAllSubPaths(conf, new Path(path));
        }
    }

    if (_props.containsKey("output.path")) {
        String location = _props.get("output.path");
        if (location.endsWith("#CURRENT")) {
            DateTimeFormatter format = DateTimeFormat.forPattern(COMMON_FILE_DATE_PATTERN);
            String destPath = format.print(new DateTime());
            location = location.substring(0, location.length() - "#CURRENT".length()) + destPath;
            System.out.println("Store location set to " + location);
        }

        FileOutputFormat.setOutputPath(conf, new Path(location));
        // For testing purpose only remove output file if exists
        if (_props.getBoolean("force.output.overwrite", false)) {
            FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
            fs.delete(FileOutputFormat.getOutputPath(conf), true);
        }
    }

    // Adds External jars to hadoop classpath
    String externalJarList = _props.getString("hadoop.external.jarFiles", null);
    if (externalJarList != null) {
        String[] jarFiles = externalJarList.split(",");
        for (String jarFile : jarFiles) {
            info("Adding extenral jar File:" + jarFile);
            DistributedCache.addFileToClassPath(new Path(jarFile), conf);
        }
    }

    // Adds distributed cache files
    String cacheFileList = _props.getString("hadoop.cache.files", null);
    if (cacheFileList != null) {
        String[] cacheFiles = cacheFileList.split(",");
        for (String cacheFile : cacheFiles) {
            info("Adding Distributed Cache File:" + cacheFile);
            DistributedCache.addCacheFile(new URI(cacheFile), conf);
        }
    }

    // Adds distributed cache archives
    String archiveFileList = _props.getString("hadoop.cache.archives", null);
    if (archiveFileList != null) {
        String[] archiveFiles = archiveFileList.split(",");
        for (String archiveFile : archiveFiles) {
            info("Adding Distributed Cache Archive File:" + archiveFile);
            DistributedCache.addCacheArchive(new URI(archiveFile), conf);
        }
    }

    String hadoopCacheJarDir = _props.getString("hdfs.default.classpath.dir", null);
    if (hadoopCacheJarDir != null) {
        FileSystem fs = FileSystem.get(conf);
        if (fs != null) {
            FileStatus[] status = fs.listStatus(new Path(hadoopCacheJarDir));

            if (status != null) {
                for (int i = 0; i < status.length; ++i) {
                    if (!status[i].isDir()) {
                        Path path = new Path(hadoopCacheJarDir, status[i].getPath().getName());
                        info("Adding Jar to Distributed Cache Archive File:" + path);

                        DistributedCache.addFileToClassPath(path, conf);
                    }
                }
            } else {
                info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " is empty.");
            }
        } else {
            info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " filesystem doesn't exist");
        }
    }

    // May want to add this to HadoopUtils, but will await refactoring
    for (String key : getProps().keySet()) {
        String lowerCase = key.toLowerCase();
        if (lowerCase.startsWith(HADOOP_PREFIX)) {
            String newKey = key.substring(HADOOP_PREFIX.length());
            conf.set(newKey, getProps().get(key));
        }
    }

    HadoopUtils.setPropsInJob(conf, getProps());
    return conf;
}

From source file: voldemort.store.readonly.mr.HadoopStoreBuilder.java

License: Apache License

/**
 * Run the job
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete the temp dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {

            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor() / chunkSizeBytes),
                        1);
            } else {
                logger.info(
                        "Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }

            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {

            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info(
                        "Overriding chunk size byte and taking num chunks (" + this.numChunks + ") directly");
            }

            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);

            conf.setInputFormat(inputFormatClass);

            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));

            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));

            AvroJob.setMapperClass(conf, mapperClass);

            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }

        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers + ", save keys: "
                + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed log the counter
        Counters counters = job.getCounters();

        if (saveKeys) {
            if (reducerPerBucket) {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            } else {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            }
        }

        // Do a CheckSumOfCheckSum - Similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check if all folder exists and with format file
        for (Node node : cluster.getNodes()) {

            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();

            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());

            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {

                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {

                    public boolean accept(Path arg0) {
                        if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                            return true;
                        }
                        return false;
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;

                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));

                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();

        }

    } catch (Exception e) {
        logger.error("Error in Store builder", e);
        throw new VoldemortException(e);
    }

}

From source file: voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java

License: Apache License

/**
 * Given a filesystem and the path to a node, gets all the data chunk files
 * (irrespective of partition, replica type, etc.)
 *
 * Works only for {@link ReadOnlyStorageFormat.READONLY_V2}
 *
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @return List of all data chunk files under the node directory
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(FileSystem fs, Path path) throws IOException {
    return fs.listStatus(path, new PathFilter() {

        public boolean accept(Path input) {
            if (input.getName().matches("^[\\d]+_[\\d]+_[\\d]+\\.data")) {
                return true;
            } else {
                return false;
            }
        }
    });
}
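
A call site for this helper might look like the following (the command-line node directory and the wrapping class are illustrative assumptions; the two overloads below work the same way with the partition, replica type, and chunk id added to the filter):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import voldemort.store.readonly.mr.HadoopStoreBuilderUtils;

public class DataChunkFilesUsage {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Path nodeDir = new Path(args[0]); // e.g. a "node-0" directory produced by the store builder

        // Matches every file named like 0_1_2.data, regardless of partition or replica type.
        for (FileStatus chunk : HadoopStoreBuilderUtils.getDataChunkFiles(fs, nodeDir)) {
            System.out.println(chunk.getPath().getName());
        }
    }
}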

From source file: voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java

License: Apache License

/**
 * Given a filesystem and path to a node, gets all the files which belong to
 * a partition and replica type
 * 
 * Works only for {@link ReadOnlyStorageFormat.READONLY_V2}
 * 
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @param partitionId The partition id for which we get the files
 * @param replicaType The replica type
 * @return Returns list of files of this partition, replicaType
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(FileSystem fs, Path path, final int partitionId,
        final int replicaType) throws IOException {
    return fs.listStatus(path, new PathFilter() {

        public boolean accept(Path input) {
            if (input.getName().matches("^" + Integer.toString(partitionId) + "_"
                    + Integer.toString(replicaType) + "_[\\d]+\\.data")) {
                return true;
            } else {
                return false;
            }
        }
    });
}

From source file: voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java

License: Apache License

/**
 * Given a filesystem and path to a node, gets all the files which belong to
 * a partition, replica type and chunk id
 *
 * Works only for {@link ReadOnlyStorageFormat.READONLY_V2}
 * 
 * @param fs Underlying filesystem
 * @param path The node directory path
 * @param partitionId The partition id for which we get the files
 * @param replicaType The replica type
 * @param chunkId The chunk id
 * @return Returns list of files of this partition, replicaType, chunkId
 * @throws IOException
 */
public static FileStatus[] getDataChunkFiles(FileSystem fs, Path path, final int partitionId,
        final int replicaType, final int chunkId) throws IOException {
    return fs.listStatus(path, new PathFilter() {

        public boolean accept(Path input) {
            if (input.getName().matches("^" + Integer.toString(partitionId) + "_"
                    + Integer.toString(replicaType) + "_" + Integer.toString(chunkId) + "\\.data")) {
                return true;
            } else {
                return false;
            }
        }
    });
}

From source file: voldemort.store.readonly.mr.utils.HadoopUtils.java

License: Apache License

/**
 * Looks for the latest (the alphabetically greatest) path contained in the
 * given directory that passes the specified regex pattern.
 *
 * @param fs The file system
 * @param directory The directory that will contain the versions
 * @param acceptRegex The String pattern
 * @return The latest versioned path, or null if no path matches
 * @throws IOException
 */
public static Path getLatestVersionedPath(FileSystem fs, Path directory, String acceptRegex)
        throws IOException {
    final String pattern = acceptRegex != null ? acceptRegex : "\\S+";

    PathFilter filter = new PathFilter() {

        @Override
        public boolean accept(Path arg0) {
            return !arg0.getName().startsWith("_") && Pattern.matches(pattern, arg0.getName());
        }
    };

    FileStatus[] statuses = fs.listStatus(directory, filter);

    if (statuses == null || statuses.length == 0) {
        return null;
    }

    Arrays.sort(statuses);

    return statuses[statuses.length - 1].getPath();
}
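
As a usage sketch (the directory argument, the "version-[0-9]+" regex, and the wrapping class are assumptions for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import voldemort.store.readonly.mr.utils.HadoopUtils;

public class LatestVersionUsage {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());

        // Pick the alphabetically greatest child of the directory whose name matches
        // "version-<digits>"; paths starting with "_" are skipped by the filter.
        Path latest = HadoopUtils.getLatestVersionedPath(fs, new Path(args[0]), "version-[0-9]+");
        System.out.println(latest == null ? "no matching version found" : latest.toString());
    }
}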

From source file: voldemort.store.readonly.mr.utils.HadoopUtils.java

License: Apache License

/**
 * Cleans up old (alphabetically least) data paths that are accepted by the
 * regex, keeping only the newest versions.
 *
 * @param fs The file system
 * @param directory The directory that will contain the versions
 * @param acceptRegex The String pattern
 * @param backupNumber The number of versions to keep; older versions are
 *        deleted.
 * @throws IOException
 */
public static void cleanupOlderVersions(FileSystem fs, Path directory, final String acceptRegex,
        int backupNumber) throws IOException {
    if (backupNumber < 1) {
        logger.error("Number of versions must be 1 or greater");
        return;
    }

    PathFilter filter = new PathFilter() {

        @Override
        public boolean accept(Path arg0) {
            return !arg0.getName().startsWith("_") && Pattern.matches(acceptRegex, arg0.getName());
        }
    };

    FileStatus[] statuses = fs.listStatus(directory, filter);
    if (statuses == null) {
        logger.info("No backup files found");
        return;
    }

    Arrays.sort(statuses);

    int lastIndex = statuses.length - backupNumber;
    for (int i = 0; i < lastIndex; ++i) {
        logger.info("Deleting " + statuses[i].getPath());
        fs.delete(statuses[i].getPath(), true);
    }
}
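
A matching usage sketch (directory, regex, and class name are again assumptions):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import voldemort.store.readonly.mr.utils.HadoopUtils;

public class CleanupUsage {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());

        // Keep the two newest "version-<digits>" directories and delete the rest.
        HadoopUtils.cleanupOlderVersions(fs, new Path(args[0]), "version-[0-9]+", 2);
    }
}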

From source file: wikiduper.clir.rp.TextDocnoMappingBuilder.java

License: Apache License

@Override
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    System.out.println("WTF WTF args: " + Arrays.toString(args));
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + TextDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TextDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = Job.getInstance(getConf());

    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TextDocnoMappingBuilder.class);

    job.setNumReduceTasks(1);

    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
    Path collectionPath = new Path(options.collection);
    for (FileStatus status : fs.listStatus(collectionPath, filter)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(TextInputFormat.class); //options.inputFormat);
    LOG.info("Input format : " + options.inputFormat);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    LOG.info("Here1\n");
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
    fs.delete(new Path(tmpDir), true);

    return 0;
}