List of usage examples for org.apache.mahout.common.commandline DefaultOptionCreator OVERWRITE_OPTION
String OVERWRITE_OPTION
To view the source code for org.apache.mahout.common.commandline DefaultOptionCreator OVERWRITE_OPTION, click Source Link.
From source file:chapter5.KMeanSample.java
License:Apache License
/**
 * Registers the clustering command-line options, parses {@code args}, and dispatches to one
 * of the two {@code run(...)} overloads.
 *
 * <p>When the number-of-clusters option is present, the overload taking a cluster count is
 * used; otherwise the overload taking the T1/T2 thresholds is used. If the overwrite flag is
 * set, the output path is deleted before the job starts.
 *
 * @param args raw command-line arguments
 * @return {@code 0} after dispatching, or {@code -1} when {@code parseArguments} returns null
 * @throws Exception propagated from option parsing, file-system access, or the job itself
 */
@Override
public int run(String[] args) throws Exception {
  // Declare every option this job understands.
  addInputOption();
  addOutputOption();
  addOption(DefaultOptionCreator.distanceMeasureOption().create());
  addOption(DefaultOptionCreator.numClustersOption().create());
  addOption(DefaultOptionCreator.t1Option().create());
  addOption(DefaultOptionCreator.t2Option().create());
  addOption(DefaultOptionCreator.convergenceOption().create());
  addOption(DefaultOptionCreator.maxIterationsOption().create());
  addOption(DefaultOptionCreator.overwriteOption().create());

  if (parseArguments(args) == null) {
    return -1;
  }

  Path inputPath = getInputPath();
  Path outputPath = getOutputPath();

  // Fall back to squared Euclidean distance when no measure was requested.
  String requestedMeasure = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
  String measureClassName = requestedMeasure != null
      ? requestedMeasure
      : SquaredEuclideanDistanceMeasure.class.getName();

  double delta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
  int iterationLimit = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));

  if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
    HadoopUtil.delete(getConf(), outputPath);
  }

  DistanceMeasure distance = ClassUtils.instantiateAs(measureClassName, DistanceMeasure.class);

  if (!hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
    // No explicit cluster count: drive the run from the T1/T2 thresholds instead.
    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    run(getConf(), inputPath, outputPath, distance, t1, t2, delta, iterationLimit);
    return 0;
  }

  int clusterCount = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
  run(getConf(), inputPath, outputPath, distance, clusterCount, delta, iterationLimit);
  return 0;
}
From source file:cn.macthink.hadoop.tdt.clustering.canopy.CanopyClustering.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();//from w ww . j ava 2s . c o m addOutputOption(); addOption(DefaultOptionCreator.distanceMeasureOption().create()); addOption(DefaultOptionCreator.t1Option().create()); addOption(DefaultOptionCreator.t2Option().create()); addOption(DefaultOptionCreator.overwriteOption().create()); Map<String, List<String>> argMap = parseArguments(args); if (argMap == null) { return -1; } Path input = getInputPath(); Path output = getOutputPath(); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(new Configuration(), output); } String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION); double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION)); double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION)); DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class); run(input, output, measure, t1, t2); return 0; }
From source file:com.eniyitavsiye.mahoutx.hadoop.Job.java
License:Apache License
/**
 * Command-line entry point: declares the supported options, parses {@code args}, and hands
 * off to the matching {@code run(...)} overload.
 *
 * <p>The overload taking a cluster count is chosen when the number-of-clusters option is
 * present; otherwise the overload taking T1/T2 thresholds is chosen. The output path is
 * deleted beforehand when the overwrite flag is set.
 *
 * @param args raw command-line arguments
 * @return {@code 0} after dispatching, or {@code -1} when {@code parseArguments} returns null
 * @throws Exception propagated from option parsing, file-system access, or the job itself
 */
@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  addOption(DefaultOptionCreator.distanceMeasureOption().create());
  addOption(DefaultOptionCreator.numClustersOption().create());
  addOption(DefaultOptionCreator.t1Option().create());
  addOption(DefaultOptionCreator.t2Option().create());
  addOption(DefaultOptionCreator.convergenceOption().create());
  addOption(DefaultOptionCreator.maxIterationsOption().create());
  addOption(DefaultOptionCreator.overwriteOption().create());

  Map<String, List<String>> parsed = parseArguments(args);
  if (parsed == null) {
    return -1;
  }

  Path source = getInputPath();
  Path target = getOutputPath();

  // Default distance measure when none is given on the command line.
  String measureName = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
  if (measureName == null) {
    measureName = SquaredEuclideanDistanceMeasure.class.getName();
  }

  String deltaText = getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION);
  double convergence = Double.parseDouble(deltaText);
  String iterationsText = getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION);
  int iterations = Integer.parseInt(iterationsText);

  boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
  if (overwrite) {
    HadoopUtil.delete(getConf(), target);
  }

  DistanceMeasure distanceMeasure = ClassUtils.instantiateAs(measureName, DistanceMeasure.class);

  boolean explicitClusterCount = hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION);
  if (explicitClusterCount) {
    int numClusters = Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
    run(getConf(), source, target, distanceMeasure, numClusters, convergence, iterations);
  } else {
    double thresholdT1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
    double thresholdT2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
    run(getConf(), source, target, distanceMeasure, thresholdT1, thresholdT2, convergence,
        iterations);
  }
  return 0;
}
From source file:com.luca.filipponi.tweetAnalysis.SentimentClassifier.CustomTestNaiveBayesDriver.java
License:Apache License
/**
 * Classifies the input vectors with a previously trained naive Bayes model and logs an
 * analysis of the results.
 *
 * <p>In sequential mode the model is materialized locally, each input vector is classified
 * with {@code classifyFull}, and the score vectors are written to the output path. Otherwise
 * the work is delegated to {@code runMapReduce}. Afterwards the label index is read and the
 * output is passed to {@code analyzeResults} together with a {@code ResultAnalyzer}.
 *
 * @param args raw command-line arguments
 * @return {@code 0} on success; {@code -1} when {@code parseArguments} returns null or the
 *         map-reduce run reports failure
 * @throws Exception propagated from option parsing, file-system access, or the job itself
 */
@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  // Register the overwrite option exactly once; addOption(Option) hands the option back,
  // so nesting the call registered it a second time.
  addOption(DefaultOptionCreator.overwriteOption().create());
  addOption("model", "m", "The path to the model built during training", true);
  addOption(
      buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
  addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
  addOption("labelIndex", "l", "The path to the location of the label index", true);

  Map<String, List<String>> parsedArgs = parseArguments(args);
  if (parsedArgs == null) {
    return -1;
  }
  if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
    HadoopUtil.delete(getConf(), getOutputPath());
  }

  boolean complementary = hasOption("testComplementary");
  boolean sequential = hasOption("runSequential");
  if (sequential) {
    FileSystem fs = FileSystem.get(getConf());
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
    AbstractNaiveBayesClassifier classifier;
    if (complementary) {
      classifier = new ComplementaryNaiveBayesClassifier(model);
    } else {
      classifier = new StandardNaiveBayesClassifier(model);
    }

    // Close both files even when reading or classification fails part-way through.
    SequenceFile.Writer writer =
        new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class, VectorWritable.class);
    try {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, getInputPath(), getConf());
      try {
        Text key = new Text();
        VectorWritable vw = new VectorWritable();
        while (reader.next(key, vw)) {
          // The output key is the second SLASH-separated component of the input key.
          writer.append(new Text(SLASH.split(key.toString())[1]),
              new VectorWritable(classifier.classifyFull(vw.get())));
        }
      } finally {
        reader.close();
      }
    } finally {
      writer.close();
    }
  } else {
    boolean succeeded = runMapReduce(parsedArgs);
    if (!succeeded) {
      return -1;
    }
  }

  // load the labels
  Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));

  // loop over the results and create the confusion matrix
  SequenceFileDirIterable<Text, VectorWritable> dirIterable =
      new SequenceFileDirIterable<Text, VectorWritable>(
          getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
  ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
  analyzeResults(labelMap, dirIterable, analyzer);

  log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer);
  return 0;
}
From source file:com.missionsky.scp.dataanalysis.mahout.TestNaiveBayesDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*from ww w . ja v a2s . com*/ addOutputOption(); addOption(addOption(DefaultOptionCreator.overwriteOption().create())); addOption("model", "m", "The path to the model built during training", true); addOption( buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false))); addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false))); addOption("labelIndex", "l", "The path to the location of the label index", true); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), getOutputPath()); } boolean complementary = hasOption("testComplementary"); boolean sequential = hasOption("runSequential"); if (sequential) { FileSystem fs = FileSystem.get(getConf()); NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf()); AbstractNaiveBayesClassifier classifier; if (complementary) { classifier = new ComplementaryNaiveBayesClassifier(model); } else { classifier = new StandardNaiveBayesClassifier(model); } SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class, VectorWritable.class); Reader reader = new Reader(fs, getInputPath(), getConf()); Text key = new Text(); VectorWritable vw = new VectorWritable(); while (reader.next(key, vw)) { writer.append(new Text(SLASH.split(key.toString())[1]), new VectorWritable(classifier.classifyFull(vw.get()))); } writer.close(); reader.close(); } else { boolean succeeded = runMapReduce(parsedArgs); if (!succeeded) { return -1; } } //load the labels Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex"))); //loop over the results and create the confusion matrix SequenceFileDirIterable<Text, VectorWritable> dirIterable = new 
SequenceFileDirIterable<Text, VectorWritable>( getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf()); ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT"); analyzeResults(labelMap, dirIterable, analyzer); log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer); return 0; }
From source file:com.netease.news.classifier.naivebayes.TrainNaiveBayesJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*w w w .j a v a 2 s. c o m*/ addOutputOption(); addOption(LABELS, "l", "comma-separated list of labels to include in training", false); addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, "")); addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f)); addOption( buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false))); addOption(LABEL_INDEX, "li", "The path to store the label index in", false); addOption(DefaultOptionCreator.overwriteOption().create()); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), getOutputPath()); HadoopUtil.delete(getConf(), getTempPath()); } Path labPath; String labPathStr = getOption(LABEL_INDEX); if (labPathStr != null) { labPath = new Path(labPathStr); } else { labPath = getTempPath(LABEL_INDEX); } long labelSize = createLabelIndex(labPath); float alphaI = Float.parseFloat(getOption(ALPHA_I)); boolean trainComplementary = Boolean.parseBoolean(getOption(TRAIN_COMPLEMENTARY)); HadoopUtil.setSerializations(getConf()); HadoopUtil.cacheFiles(labPath, getConf()); //add up all the vectors with the same labels, while mapping the labels into our index Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS), SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class); indexInstances.setCombinerClass(VectorSumReducer.class); boolean succeeded = indexInstances.waitForCompletion(true); if (!succeeded) { return -1; } //sum up all the weights from the previous step, per label and per feature Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS), 
SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class, VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class); weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize)); weightSummer.setCombinerClass(VectorSumReducer.class); succeeded = weightSummer.waitForCompletion(true); if (!succeeded) { return -1; } //put the per label and per feature vectors into the cache HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf()); //calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors -- // TODO: add reference here to the part of the Rennie paper that discusses this Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS), SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class, VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class); thetaSummer.setCombinerClass(VectorSumReducer.class); thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI); thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary); /* TODO(robinanil): Enable this when thetanormalization works. succeeded = thetaSummer.waitForCompletion(true); if (!succeeded) { return -1; }*/ //validate our model and then write it out to the official output getConf().setFloat(ThetaMapper.ALPHA_I, alphaI); NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf()); naiveBayesModel.validate(); naiveBayesModel.serialize(getOutputPath(), getConf()); return 0; }
From source file:com.netease.news.text.SequenceFilesFromDirectory.java
License:Apache License
@Override public int run(String[] args) throws Exception { addOptions();// w w w. ja v a2s.com addOption(DefaultOptionCreator.methodOption().create()); addOption(DefaultOptionCreator.overwriteOption().create()); if (parseArguments(args) == null) { return -1; } Map<String, String> options = parseOptions(); Path output = getOutputPath(); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } if (getOption(DefaultOptionCreator.METHOD_OPTION, DefaultOptionCreator.MAPREDUCE_METHOD) .equals(DefaultOptionCreator.SEQUENTIAL_METHOD)) { runSequential(getConf(), getInputPath(), output, options); } else { runMapReduce(getInputPath(), output); } return 0; }
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
/** * Configure this instance based on the command-line arguments contained within provided array. * Calls {@link #validate()} to ensure consistency of configuration. * * @return true if the arguments were parsed successfully and execution should proceed. * @throws Exception if there is a problem parsing the command-line arguments or the particular * combination would violate class invariants. *//* w w w.j a va2 s . c o m*/ private boolean parseArgs(String[] args) throws Exception { addInputOption(); addOption("trainingOutput", "tr", "The training data output directory", false); addOption("testOutput", "te", "The test data output directory", false); addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false); addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false); addOption("splitLocation", "sl", "Location for start of test data expressed as a percentage of the input file " + "size (0=start, 50=middle, 100=end", false); addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false); addOption("randomSelectionPct", "rp", "Percentage of items to be randomly selected as test data when using " + "mapreduce mode", false); addOption("charset", "c", "The name of the character encoding of the input files (not needed if using " + "SequenceFiles)", false); addOption(buildOption("sequenceFiles", "seq", "Set if the input files are sequence files. Default is false", false, false, "false")); addOption(DefaultOptionCreator.methodOption().create()); addOption(DefaultOptionCreator.overwriteOption().create()); //TODO: extend this to sequential mode addOption("keepPct", "k", "The percentage of total data to keep in map-reduce mode, the rest will be ignored. 
" + "Default is 100%", false); addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false); if (parseArguments(args) == null) { return false; } try { inputDirectory = getInputPath(); useMapRed = getOption(DefaultOptionCreator.METHOD_OPTION) .equalsIgnoreCase(DefaultOptionCreator.MAPREDUCE_METHOD); if (useMapRed) { if (!hasOption("randomSelectionPct")) { throw new OptionException(getCLIOption("randomSelectionPct"), "must set randomSelectionPct when mapRed option is used"); } if (!hasOption("mapRedOutputDir")) { throw new OptionException(getCLIOption("mapRedOutputDir"), "mapRedOutputDir must be set when mapRed option is used"); } mapRedOutputDirectory = new Path(getOption("mapRedOutputDir")); if (hasOption("keepPct")) { keepPct = Integer.parseInt(getOption("keepPct")); } if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), mapRedOutputDirectory); } } else { if (!hasOption("trainingOutput") || !hasOption("testOutput")) { throw new OptionException(getCLIOption("trainingOutput"), "trainingOutput and testOutput must be set if mapRed option is not used"); } if (!hasOption("testSplitSize") && !hasOption("testSplitPct") && !hasOption("randomSelectionPct") && !hasOption("randomSelectionSize")) { throw new OptionException(getCLIOption("testSplitSize"), "must set one of test split size/percentage or randomSelectionSize/percentage"); } trainingOutputDirectory = new Path(getOption("trainingOutput")); testOutputDirectory = new Path(getOption("testOutput")); FileSystem fs = trainingOutputDirectory.getFileSystem(getConf()); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(fs.getConf(), trainingOutputDirectory); HadoopUtil.delete(fs.getConf(), testOutputDirectory); } fs.mkdirs(trainingOutputDirectory); fs.mkdirs(testOutputDirectory); } if (hasOption("charset")) { charset = Charset.forName(getOption("charset")); } if (hasOption("testSplitSize") && hasOption("testSplitPct")) { throw new 
OptionException(getCLIOption("testSplitPct"), "must have either split size or split percentage " + "option, not BOTH"); } if (hasOption("testSplitSize")) { setTestSplitSize(Integer.parseInt(getOption("testSplitSize"))); } if (hasOption("testSplitPct")) { setTestSplitPct(Integer.parseInt(getOption("testSplitPct"))); } if (hasOption("splitLocation")) { setSplitLocation(Integer.parseInt(getOption("splitLocation"))); } if (hasOption("randomSelectionSize")) { setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize"))); } if (hasOption("randomSelectionPct")) { setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct"))); } useSequence = hasOption("sequenceFiles"); } catch (OptionException e) { log.error("Command-line option Exception", e); CommandLineUtil.printHelp(getGroup()); return false; } validate(); return true; }
From source file:com.pocketx.gravity.recommender.cf.similarity.job.RowSimilarityJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*www . j a va2s.c o m*/ addOutputOption(); addOption("numberOfColumns", "r", "Number of columns in the input matrix", false); addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use " + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')'); addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')', String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW)); addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?", String.valueOf(false)); addOption("threshold", "tr", "discard row pairs with a similarity value below this", false); addOption(DefaultOptionCreator.overwriteOption().create()); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } int numberOfColumns; if (hasOption("numberOfColumns")) { // Number of columns explicitly specified via CLI numberOfColumns = Integer.parseInt(getOption("numberOfColumns")); } else { // else get the number of columns by determining the cardinality of a vector in the input matrix numberOfColumns = getDimensions(getInputPath()); } String similarityClassnameArg = getOption("similarityClassname"); String similarityClassname; try { similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname(); } catch (IllegalArgumentException iae) { similarityClassname = similarityClassnameArg; } // Clear the output and temp paths if the overwrite option has been set if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { // Clear the temp path HadoopUtil.delete(getConf(), getTempPath()); // Clear the output path HadoopUtil.delete(getConf(), getOutputPath()); } int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow")); boolean excludeSelfSimilarity = 
Boolean.parseBoolean(getOption("excludeSelfSimilarity")); double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD; Path weightsPath = getTempPath("weights"); Path normsPath = getTempPath("norms.bin"); Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin"); Path maxValuesPath = getTempPath("maxValues.bin"); Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity"); AtomicInteger currentPhase = new AtomicInteger(); if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job normsAndTranspose = prepareJob(getInputPath(), weightsPath, VectorNormMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class); normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class); Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration(); normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold)); normsAndTransposeConf.set(NORMS_PATH, normsPath.toString()); normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString()); normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString()); normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname); boolean succeeded = normsAndTranspose.waitForCompletion(true); if (!succeeded) { return -1; } } if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class, IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class, VectorWritable.class); pairwiseSimilarity.setCombinerClass(VectorSumReducer.class); Configuration pairwiseConf = pairwiseSimilarity.getConfiguration(); pairwiseConf.set(THRESHOLD, String.valueOf(threshold)); pairwiseConf.set(NORMS_PATH, normsPath.toString()); pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString()); pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString()); pairwiseConf.set(SIMILARITY_CLASSNAME, 
similarityClassname); pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns); pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity); boolean succeeded = pairwiseSimilarity.waitForCompletion(true); if (!succeeded) { return -1; } } if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class, IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class, IntWritable.class, VectorWritable.class); asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class); asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow); boolean succeeded = asMatrix.waitForCompletion(true); if (!succeeded) { return -1; } } return 0; }
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*from w w w . j a v a 2s . co m*/ addOutputOption(); addOption(DefaultOptionCreator.numReducersOption().create()); addOption("maxNGramSize", "ng", "(Optional) The max size of ngrams to create (2 = bigrams, 3 = trigrams, etc) default: 2", String.valueOf(DEFAULT_MAX_NGRAM_SIZE)); addOption("minSupport", "s", "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT, String.valueOf(CollocReducer.DEFAULT_MIN_SUPPORT)); addOption("minValue", "minV", "(Optional)The minimum value for association metric(Float) Default is " + AssocReducer.DEFAULT_MIN_VALUE, String.valueOf(AssocReducer.DEFAULT_MIN_VALUE)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption("metric", "m", "The association metric to use, one of {llr,dice,pmi,chi}", AssocReducer.DEFAULT_ASSOC); addFlag("unigram", "u", "If set, unigrams will be emitted in the final output alongside collocations"); addOption("windowSize", "ws", "(Optional) Window size"); addOption("windowMode", "wm", "(Optional) DOCUMENT, SENTENCE, S_WINDOW, C_WINDOW, FIXED"); addOption("ngramLimit", "nl", "(Optional) maximum of ngrams per unit - to prevent memory overflow"); addOption("usePos", "p", "(Optional)"); Map<String, List<String>> argMap = parseArguments(args); if (argMap == null) { return -1; } Path input = getInputPath(); Path output = getOutputPath(); int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE; if (hasOption("maxNGramSize")) { try { maxNGramSize = Integer.parseInt(getOption("maxNGramSize")); } catch (NumberFormatException ex) { log.warn("Could not parse ngram size option"); } } log.info("Maximum n-gram size is: {}", maxNGramSize); if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { HadoopUtil.delete(getConf(), output); } int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT; if (getOption("minSupport") != null) { minSupport = Integer.parseInt(getOption("minSupport")); } log.info("Minimum Support value: {}", 
minSupport); float minValue = AssocReducer.DEFAULT_MIN_VALUE; if (getOption("minValue") != null) { minValue = Float.parseFloat(getOption("minValue")); } log.info("Minimum Assoc value: {}", minValue); int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS; if (getOption("maxRed") != null) { reduceTasks = Integer.parseInt(getOption("maxRed")); } log.info("Number of pass1 reduce tasks: {}", reduceTasks); String metric = AssocReducer.DEFAULT_ASSOC; if (getOption("metric") != null) { metric = getOption("metric"); } log.info("Association Metric: {}", metric); Window windowType = Window.SENTENCE; if (getOption("windowMode") != null) { windowType = Window.valueOf(getOption("windowMode").toUpperCase()); } int windowSize = 3; if (getOption("windowSize") != null) { windowSize = Integer.parseInt(getOption("windowSize")); } boolean emitUnigrams = argMap.containsKey("emitUnigrams"); reduceTasks = 14; // parse input and extract collocations long ngramCount = generateCollocations(input, output, getConf(), emitUnigrams, maxNGramSize, reduceTasks, minSupport, windowType, windowSize); // tally collocations and perform LLR calculation // for (String m : metric.split(",")) { // log.info("Computing Collocations with Association Metric: {}", m); // // extract pruning thresholds // if (m.contains(":")) { // String[] tokens = m.split(":"); // m = tokens[0]; // minValue = Float.parseFloat(tokens[1]); // } computeNGramsPruneByLLR(output, getConf(), ngramCount, emitUnigrams, minValue, reduceTasks); // only emit unigrams for the first metric emitUnigrams = false; // } return 0; }