Example usage for org.apache.mahout.common HadoopUtil delete

Introduction

In this page you can find the example usage for org.apache.mahout.common HadoopUtil delete.

Prototype

public static void delete(Configuration conf, Path... paths) throws IOException

Source Link

Usage

From source file:DisplayKMeans.java

License:Apache License

public static void main(String[] args) throws Exception {
    DistanceMeasure measure = new ManhattanDistanceMeasure();
    Path samples = new Path("samples");
    Path output = new Path("output");
    Configuration conf = new Configuration();
    HadoopUtil.delete(conf, samples);
    HadoopUtil.delete(conf, output);// ww w  .ja  va  2  s.c om

    RandomUtils.useTestSeed();
    generateSamples();
    writeSampleData(samples);
    boolean runClusterer = true;
    double convergenceDelta = 0.001;
    int numClusters = 2;
    int maxIterations = 10;
    if (runClusterer) {
        runSequentialKMeansClusterer(conf, samples, output, measure, numClusters, maxIterations,
                convergenceDelta);
    } else {
        runSequentialKMeansClassifier(conf, samples, output, measure, numClusters, maxIterations,
                convergenceDelta);
    }
    new DisplayKMeans();
}

From source file:DisplayFuzzyKMeans.java

License:Apache License

public static void main(String[] args) throws Exception {
    DistanceMeasure measure = new ManhattanDistanceMeasure();

    Path samples = new Path("samples");
    Path output = new Path("output");
    Configuration conf = new Configuration();
    HadoopUtil.delete(conf, output);
    HadoopUtil.delete(conf, samples);/*from w w  w. ja  v a2s  . c  o  m*/
    RandomUtils.useTestSeed();
    DisplayClustering.generateSamples();
    writeSampleData(samples);
    boolean runClusterer = true;
    int maxIterations = 10;
    float threshold = 0.001F;
    float m = 1.1F;
    if (runClusterer) {
        runSequentialFuzzyKClusterer(conf, samples, output, measure, maxIterations, m, threshold);
    } else {
        int numClusters = 3;
        runSequentialFuzzyKClassifier(conf, samples, output, measure, numClusters, maxIterations, m, threshold);
    }
    new DisplayFuzzyKMeans();
}

From source file:chapter5.KMeanSample.java

License:Apache License

public static void main(String[] args) throws Exception {
    Path output = new Path("output");
    Configuration conf = new Configuration();
    HadoopUtil.delete(conf, output);
    run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
}

From source file:chapter5.KMeanSample.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();/*from ww  w .j  a  va2 s . c o m*/
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.numClustersOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.convergenceOption().create());
    addOption(DefaultOptionCreator.maxIterationsOption().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, String> argMap = parseArguments(args);
    if (argMap == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    if (measureClass == null) {
        measureClass = SquaredEuclideanDistanceMeasure.class.getName();
    }
    double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
    int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), output);
    }
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
        int k = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
        run(getConf(), input, output, measure, k, convergenceDelta, maxIterations);
    } else {
        double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
        double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
        run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
    }
    return 0;
}

From source file:cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length > 0) {
        log.info("Running with only user-supplied arguments");
        ToolRunner.run(new Configuration(), new CanopyClustering(), args);
    } else {/*from  w w w. j av  a 2 s .  c o m*/
        log.info("Running with default arguments");
        Path output = new Path("output");
        HadoopUtil.delete(new Configuration(), output);
        run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 55);
    }
}

From source file:cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    addInputOption();//w  w  w .  j a  v  a2s . c  om
    addOutputOption();
    addOption(DefaultOptionCreator.distanceMeasureOption().create());
    addOption(DefaultOptionCreator.t1Option().create());
    addOption(DefaultOptionCreator.t2Option().create());
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> argMap = parseArguments(args);
    if (argMap == null) {
        return -1;
    }

    Path input = getInputPath();
    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(new Configuration(), output);
    }
    String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);

    run(input, output, measure, t1, t2);
    return 0;
}

From source file:com.caseystella.ingest.SparseVectorsFromSequenceFiles.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();
    Option libJarsOpt = obuilder.withLongName("libjars")
            .withArgument(abuilder.withName("libjars").withMinimum(1).withMaximum(1).create())
            .withDescription("The default arg for libjars").withShortName("libjars").create();
    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(//w ww  . ja  v a2  s. c  om
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
                            + "  Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
                            + "will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(libJarsOpt).withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt)
            .withOption(minDFOpt).withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt)
            .withOption(powerOpt).withOption(minLLROpt).withOption(numReduceTasksOpt)
            .withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
            .withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt).withOption(logNormalizeOpt)
            .create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
        }

        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }

        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

        if (!processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }
        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.FPGrowthDriver.java

License:Apache License

/**
 * Run TopK FPGrowth given the input file,
 *//*from w  w  w  .  j a v a 2 s.  c om*/
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();

    addOption("minSupport", "s",
            "(Optional) The minimum number of times a co-occurrence must be present." + " Default Value: 3",
            "3");
    addOption("maxHeapSize", "k",
            "(Optional) Maximum Heap Size k, to denote the requirement to mine top K items."
                    + " Default value: 50",
            "50");
    addOption("numGroups", "g",
            "(Optional) Number of groups the features should be divided in the map-reduce version."
                    + " Doesn't work in sequential version Default Value:" + PFPGrowth.NUM_GROUPS_DEFAULT,
            Integer.toString(PFPGrowth.NUM_GROUPS_DEFAULT));
    addOption("splitterPattern", "regex",
            "Regular Expression pattern used to split given string transaction into"
                    + " itemsets. Default value splits comma separated itemsets.  Default Value:"
                    + " \"[ ,\\t]*[,|\\t][ ,\\t]*\" ",
            "[ ,\t]*[,|\t][ ,\t]*");
    addOption("numTreeCacheEntries", "tc", "(Optional) Number of entries in the tree cache to prevent duplicate"
            + " tree building. (Warning) a first level conditional FP-Tree might consume a lot of memory, "
            + "so keep this value small, but big enough to prevent duplicate tree building. "
            + "Default Value:5 Recommended Values: [5-10]", "5");
    addOption("method", "method", "Method of processing: sequential|mapreduce", "sequential");
    addOption("encoding", "e", "(Optional) The file encoding.  Default value: UTF-8", "UTF-8");
    addFlag("useFPG2", "2", "Use an alternate FPG implementation");

    if (parseArguments(args) == null) {
        return -1;
    }

    Parameters params = new Parameters();

    if (hasOption("minSupport")) {
        String minSupportString = getOption("minSupport");
        params.set("minSupport", minSupportString);
    }
    if (hasOption("maxHeapSize")) {
        String maxHeapSizeString = getOption("maxHeapSize");
        params.set("maxHeapSize", maxHeapSizeString);
    }
    if (hasOption("numGroups")) {
        String numGroupsString = getOption("numGroups");
        params.set("numGroups", numGroupsString);
    }

    if (hasOption("numTreeCacheEntries")) {
        String numTreeCacheString = getOption("numTreeCacheEntries");
        params.set("treeCacheSize", numTreeCacheString);
    }

    if (hasOption("splitterPattern")) {
        String patternString = getOption("splitterPattern");
        params.set("splitPattern", patternString);
    }

    String encoding = "UTF-8";
    if (hasOption("encoding")) {
        encoding = getOption("encoding");
    }
    params.set("encoding", encoding);

    if (hasOption("useFPG2")) {
        params.set(PFPGrowth.USE_FPG2, "true");
    }

    Path inputDir = getInputPath();
    Path outputDir = getOutputPath();

    params.set("input", inputDir.toString());
    params.set("output", outputDir.toString());

    String classificationMethod = getOption("method");
    if ("sequential".equalsIgnoreCase(classificationMethod)) {
        runFPGrowth(params);
    } else if ("mapreduce".equalsIgnoreCase(classificationMethod)) {
        Configuration conf = new Configuration();
        HadoopUtil.delete(conf, outputDir);
        PFPGrowth.runPFPGrowth(params);
    }

    return 0;
}

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java

License:Apache License

/**
 * Serializes the fList and returns the string representation of the List
 *///from w  w  w .j  a  v  a2s .c  om
public static void saveFList(Iterable<Pair<String, Long>> flist, Parameters params, Configuration conf)
        throws IOException {
    Path flistPath = new Path(params.get(OUTPUT), F_LIST);
    FileSystem fs = FileSystem.get(flistPath.toUri(), conf);
    flistPath = fs.makeQualified(flistPath);
    HadoopUtil.delete(conf, flistPath);
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, flistPath, Text.class, LongWritable.class);
    try {
        for (Pair<String, Long> pair : flist) {
            writer.append(new Text(pair.getFirst()), new LongWritable(pair.getSecond()));
        }
    } finally {
        writer.close();
    }
    DistributedCache.addCacheFile(flistPath.toUri(), conf);
}

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java

License:Apache License

/**
 * Run the aggregation Job to aggregate the different TopK patterns and group each Pattern by the features
 * present in it and thus calculate the final Top K frequent Patterns for each feature
 *///w  ww .  j a  v a2s. co m
public static void startAggregating(Parameters params, Configuration conf)
        throws IOException, InterruptedException, ClassNotFoundException {

    conf.set(PFP_PARAMETERS, params.toString());
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(OUTPUT), FPGROWTH);
    Job job = new Job(conf, "PFP Aggregator Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TopKStringPatterns.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(AggregatorMapper.class);
    job.setCombinerClass(AggregatorReducer.class);
    job.setReducerClass(AggregatorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, outPath);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}