Example usage for java.lang Float POSITIVE_INFINITY

List of usage examples for java.lang Float POSITIVE_INFINITY

Introduction

On this page you can find example usages of java.lang Float POSITIVE_INFINITY.

Prototype

float POSITIVE_INFINITY

Document

A constant holding the positive infinity of type float.
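
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the class name PositiveInfinityDemo is purely illustrative) showing the two points the examples rely on: arithmetic that overflows a float produces Float.POSITIVE_INFINITY, and the constant is a convenient seed value when scanning for a minimum.

public class PositiveInfinityDemo {
    public static void main(String[] args) {
        // Overflowing past Float.MAX_VALUE yields positive infinity.
        float overflow = Float.MAX_VALUE * 2f;
        System.out.println(overflow == Float.POSITIVE_INFINITY); // true
        System.out.println(Float.isInfinite(overflow));          // true

        // A common idiom in the examples below: seed a running minimum with
        // POSITIVE_INFINITY so the first finite value always replaces it.
        float min = Float.POSITIVE_INFINITY;
        for (float v : new float[] { 3.5f, -1.2f, 7.0f }) {
            min = Math.min(min, v);
        }
        System.out.println(min); // -1.2
    }
}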

Usage

From source file:io.github.gjyaiya.stetho.realm.Database.java

private List<Object> flattenRows(Table table, long limit, boolean addRowIndex) {
    Util.throwIfNot(limit >= 0);
    final List<Object> flatList = new ArrayList<>();
    long numColumns = table.getColumnCount();

    final RowFetcher rowFetcher = RowFetcher.getInstance();
    final long tableSize = table.size();
    for (long index = 0; index < limit && index < tableSize; index++) {
        final long row = ascendingOrder ? index : (tableSize - index - 1);
        final RowWrapper rowData = RowWrapper.wrap(rowFetcher.getRow(table, row));
        if (addRowIndex) {
            flatList.add(rowData.getIndex());
        }
        for (int column = 0; column < numColumns; column++) {
            switch (rowData.getColumnType(column)) {
            case INTEGER:
                if (rowData.isNull(column)) {
                    flatList.add(NULL);
                } else {
                    flatList.add(rowData.getLong(column));
                }
                break;
            case BOOLEAN:
                if (rowData.isNull(column)) {
                    flatList.add(NULL);
                } else {
                    flatList.add(rowData.getBoolean(column));
                }
                break;
            case STRING:
                if (rowData.isNull(column)) {
                    flatList.add(NULL);
                } else {
                    flatList.add(rowData.getString(column));
                }
                break;
            case BINARY:
                if (rowData.isNull(column)) {
                    flatList.add(NULL);
                } else {
                    flatList.add(rowData.getBinaryByteArray(column));
                }
                break;
            case FLOAT:
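                // Encode NaN and the infinities as strings; finite float values are added as-is.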
                if (rowData.isNull(column)) {
                    flatList.add(NULL);
                } else {
                    final float aFloat = rowData.getFloat(column);
                    if (Float.isNaN(aFloat)) {
                        flatList.add("NaN");
                    } else if (aFloat == Float.POSITIVE_INFINITY) {
                        flatList.add("Infinity");
                    } else if (aFloat == Float.NEGATIVE_INFINITY) {
                        flatList.add("-Infinity");
                    } else {
                        flatList.add(aFloat);
                    }
                }
                break;
            case DOUBLE:
                if (rowData.isNull(column)) {
                    flatList.add(NULL);
                } else {
                    final double aDouble = rowData.getDouble(column);
                    if (Double.isNaN(aDouble)) {
                        flatList.add("NaN");
                    } else if (aDouble == Double.POSITIVE_INFINITY) {
                        flatList.add("Infinity");
                    } else if (aDouble == Double.NEGATIVE_INFINITY) {
                        flatList.add("-Infinity");
                    } else {
                        flatList.add(aDouble);
                    }
                }
                break;
            case OLD_DATE:
            case DATE:
                if (rowData.isNull(column)) {
                    flatList.add(NULL);
                } else {
                    flatList.add(formatDate(rowData.getDate(column)));
                }
                break;
            case OBJECT:
                if (rowData.isNullLink(column)) {
                    flatList.add(NULL);
                } else {
                    flatList.add(rowData.getLink(column));
                }
                break;
            case LIST:
                // A LIST column is never null
                flatList.add(formatList(rowData.getLinkList(column)));
                break;
            default:
                flatList.add("unknown column type: " + rowData.getColumnType(column));
                break;
            }
        }
    }

    if (limit < table.size()) {
        for (int column = 0; column < numColumns; column++) {
            flatList.add("{truncated}");
        }
    }

    return flatList;
}

From source file:org.shaman.terrain.polygonal.GraphToHeightmap.java

private static void findClosestTwoPoints(Vector3f[] points, float px, float py, Vector3f first,
        Vector3f second) {
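    // Both distances start at POSITIVE_INFINITY so the first points examined always become the current closest candidates.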
    float dist1 = Float.POSITIVE_INFINITY;
    float dist2 = Float.POSITIVE_INFINITY;
    for (Vector3f p : points) {
        float d = dist(p, px, py);
        if (d < dist1) {
            dist2 = dist1;
            second.set(first);
            dist1 = d;
            first.set(p);
        } else if (d < dist2) {
            dist2 = d;
            second.set(p);
        }
    }
}

From source file:net.qvex.dommel.data.DommelDataService.java

public boolean getData(String username, String password) throws ClientProtocolException, IOException {

    // adapted from phptelemeter
    unlimited = false;

    /* login */
    Map<String, String> urlParameters = new HashMap<String, String>();
    urlParameters.put("op", "login");
    urlParameters.put("new_language", "english");
    urlParameters.put("submit", "login");
    urlParameters.put("username", username);
    urlParameters.put("password", password);

    String res;

    //        try{
    res = this.httpPost(URL_LOGIN, urlParameters);
    //
    //        }catch(Exception e)
    //        {
    //        // TODO: check for errors in res. possibly just catch nullpointer
    //        // exception..
    //            throw new Exception(e);
    //        }

    /* go to the packages page, and get the serv_id and client_id */
    res = this.httpGet(URL_PACKAGES);
    // TODO: check for errors in res

    String[] lines = res.split("\n");
    String log = null;
    int pos = 0;
    /* figure out the stats exact url */
    for (int i = 0; i < lines.length; i++) {
        pos = lines[i].indexOf(URL_STATS_INIT);
        if (pos >= 0) {
            log = lines[i].substring(pos);
            break;
        }
    }

    String url_stats = log.substring(0, log.indexOf("'"));

    /* and get the data */
    String data = this.httpGet(url_stats);

    /* logout */
    res = this.httpGet(URL_LOGOUT);

    lines = data.split("/n");
    String data2 = null;
    pos = 0;

    /* find the entry position */
    for (int i = 0; i < lines.length; i++) {
        pos = lines[i].indexOf("total traffic downloaded in broadband");
        if (pos >= 0) {
            data2 = lines[i].substring(pos);
            break;
        }
    }

    lines = data2.split("<br>");

    /* set some default positions */
    int pos_remaining = -1;
    int pos_traffic = -1;
    int pos_reset_date = -1;
    int pos_total = -1;
    @SuppressWarnings("unused")
    int strpos_total = -1;

    /* position finding & data cleanup */
    for (int i = 0; i < lines.length; i++) {
        lines[i] = stripTags(lines[i]);

        //System.out.println(lines[i]);

        if (lines[i].contains("total traffic downloaded")) {
            pos_traffic = i;
        } else if (lines[i].contains("next counter reset")) {
            pos_reset_date = i;
        } else if (lines[i].contains("remaining")) {
            pos_remaining = i;
            if (lines[i].contains("unlimited")) {
                unlimited = true;
            }
        } else if (lines[i].contains("maximum datatransfer")) {
            pos_total = i;
            /* data cleanup */
            int test_ind = lines[i].indexOf("maximum datatransfer:");
            if (test_ind >= 0 && test_ind + 1 < lines[i].length()) {
                lines[i] = lines[i].substring(test_ind + 21);
            }
        }

        /* data cleanup */
        int test_ind = lines[i].indexOf(":");
        if (test_ind >= 0 && test_ind + 1 < lines[i].length()) {
            lines[i] = lines[i].substring(test_ind + 2);
        }

    }

    /* stats */
    /* total used */
    volume_used = Float.parseFloat(lines[pos_traffic].substring(0, lines[pos_traffic].length() - 3)) * 1024;

    volume_remaining = 0;

    /* remaining, if exists? */
    if (pos_remaining >= 0) {
        if (!unlimited) {
            volume_remaining = Float
                    .parseFloat(lines[pos_remaining].substring(0, lines[pos_remaining].length() - 3)) * 1024;

            if (pos_total >= 0) {
                volume_total = Float.parseFloat(lines[pos_total].substring(0, 4)) * 1024;
            }
        } else {
            // Unlimited account
            volume_remaining = Float.POSITIVE_INFINITY;
        }
    }

    /* reset date */
    String reset_date_str = lines[pos_reset_date].substring(0, 10);
    DateFormat df = new SimpleDateFormat("dd/MM/yyyy");
    try {
        reset_date.setTime(df.parse(reset_date_str));
    } catch (ParseException e) {
        e.printStackTrace();
        // TODO: handle error
        return false;
    }

    days_left = calculateDaysLeft(reset_date);

    Date now = new Date();

    Editor edit = prefs.edit();
    edit.putFloat("volume_used", volume_used);
    edit.putFloat("volume_remaining", volume_remaining);
    edit.putFloat("volume_total", volume_total);
    edit.putLong("reset_date", reset_date.getTimeInMillis());
    edit.putInt("days_left", days_left);
    edit.putBoolean("unlimited", unlimited);
    edit.putLong("last_update", now.getTime());
    edit.putBoolean("last_update_success", true);

    return edit.commit();

}

From source file:net.sf.json.TestJSONArray.java

public void testConstructor_primitive_array_float_Infinity() {
    try {
        JSONArray.fromObject(new float[] { Float.NEGATIVE_INFINITY });
        fail("Should have thrown a JSONException");
    } catch (JSONException expected) {
        // OK
    }

    try {
        JSONArray.fromObject(new float[] { Float.POSITIVE_INFINITY });
        fail("Should have thrown a JSONException");
    } catch (JSONException expected) {
        // OK
    }
}

From source file:edworld.pdfreader4humans.PDFReader.java

protected Component createGroup(int groupIndex, Map<Component, Integer> groupMap) {
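    // Start the bounding box at +/- infinity so the first component in the group always tightens it.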
    float fromX = Float.POSITIVE_INFINITY;
    float fromY = Float.POSITIVE_INFINITY;
    float toX = Float.NEGATIVE_INFINITY;
    float toY = Float.NEGATIVE_INFINITY;
    for (Component component : groupMap.keySet())
        if (groupMap.get(component) == groupIndex) {
            fromX = Math.min(component.getFromX(), fromX);
            fromY = Math.min(component.getFromY(), fromY);
            toX = Math.max(component.getToX(), toX);
            toY = Math.max(component.getToY(), toY);
        }
    return new GroupComponent(fromX, fromY, toX, toY);
}

From source file:wqm.web.server.controller.WQMCalibrationController.java

private void ecCalibrateCommand(HttpSession session, Station station, AtlasSensor sensor, int phaseID,
        String command, ECSensorProbe ec_sensor_type) {
    logger.error("EC Calibrate Command.");
    if (CalibrationCommands.Accept.commandEquals(command)) {
        stationManager.acceptCalibrationPhase(session, true, station, sensor, phaseID,
                ec_sensor_type.getPacketVariable(), Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY);
        if ((phaseID + 1) >= sensor.getCalibrationPhases()) {
            //We have finished all the phases of calibration for this sensor.
            stationManager.quitCalibrationPhase(session, station, sensor);
            session.setAttribute(Messages.SUCCESS_MESSAGE, "EC Sensor calibrated.");
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                logger.error("", e);
            }
            throw new RedirectException("/");
        }
        throw new RedirectException(
                String.format("/wqm/c/%s/%d/%d", station.getCompactAddress(), sensor.getId(), phaseID + 1));
    }

}

From source file:gedi.util.ArrayUtils.java

public static int argmin(float[] a) {
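    // Seeding with POSITIVE_INFINITY lets any finite element become the new minimum; -1 is returned for an empty array.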
    float re = Float.POSITIVE_INFINITY;
    int arg = -1;
    for (int i = 0; i < a.length; i++) {
        if (a[i] < re) {
            re = a[i];
            arg = i;
        }
    }
    return arg;
}

From source file:org.opentripplanner.routing.algorithm.strategies.WeightTable.java

private void floyd() {
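    // Floyd-Warshall all-pairs shortest paths: table entries equal to POSITIVE_INFINITY mean "no path" and are skipped.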
    LOG.debug("Floyd");
    int n = table.length;
    for (int k = 0; k < n; k++) {
        for (int i = 0; i < n; i++) {
            double ik = table[i][k];
            if (ik == Float.POSITIVE_INFINITY)
                continue;
            for (int j = 0; j < n; j++) {
                double kj = table[k][j];
                if (kj == Float.POSITIVE_INFINITY)
                    continue;
                double ikj = ik + kj;
                double ij = table[i][j];
                if (ikj < ij)
                    table[i][j] = (float) ikj;
            }
        }
        if (k % 50 == 0)
            LOG.debug("k=" + k + "/" + n);
    }
}

From source file:com.elex.dmp.vectorizer.SparseVectorsFromSequenceFiles.java

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. 100-10000 MB").withShortName("chunk").create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors."
                            + "  Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less then 0 no vectors "
                            + "will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
        }

        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }

        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

        if (!processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }
        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
            Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

            // Calculate the standard deviation
            double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDF, conf,
                        docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}

From source file:org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles.java

@Override
public int run(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option inputDirOpt = DefaultOptionCreator.inputOption().create();

    Option outputDirOpt = DefaultOptionCreator.outputOption().create();

    Option minSupportOpt = obuilder.withLongName("minSupport")
            .withArgument(abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Minimum Support. Default Value: 2").withShortName("s").create();

    Option analyzerNameOpt = obuilder.withLongName("analyzerName")
            .withArgument(abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
            .withDescription("The class name of the analyzer").withShortName("a").create();

    Option chunkSizeOpt = obuilder.withLongName("chunkSize")
            .withArgument(abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
            .withDescription("The chunkSize in MegaBytes. Default Value: 100MB").withShortName("chunk")
            .create();

    Option weightOpt = obuilder.withLongName("weight").withRequired(false)
            .withArgument(abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
            .withDescription("The kind of weight to use. Currently TF or TFIDF. Default: TFIDF")
            .withShortName("wt").create();

    Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
            .withArgument(abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
            .withDescription("The minimum document frequency.  Default is 1").withShortName("md").create();

    Option maxDFPercentOpt = obuilder.withLongName("maxDFPercent").withRequired(false)
            .withArgument(abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The max percentage of docs for the DF.  Can be used to remove really high frequency terms."
                            + " Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, "
                            + "it will override this value.")
            .withShortName("x").create();

    Option maxDFSigmaOpt = obuilder.withLongName("maxDFSigma").withRequired(false)
            .withArgument(abuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) "
                            + "of the document frequencies of these vectors. Can be used to remove really high frequency terms."
                            + " Expressed as a double value. Good value to be specified is 3.0. In case the value is less "
                            + "than 0 no vectors will be filtered out. Default is -1.0.  Overrides maxDFPercent")
            .withShortName("xs").create();

    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
            .withArgument(abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is "
                    + LLRReducer.DEFAULT_MIN_LLR)
            .withShortName("ml").create();

    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
            .withArgument(abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) Number of reduce tasks. Default Value: 1").withShortName("nr")
            .create();

    Option powerOpt = obuilder.withLongName("norm").withRequired(false)
            .withArgument(abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
            .withDescription(
                    "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  "
                            + "Must be greater or equal to 0.  The default is not to normalize")
            .withShortName("n").create();

    Option logNormalizeOpt = obuilder.withLongName("logNormalize").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
            .withShortName("lnorm").create();

    Option maxNGramSizeOpt = obuilder.withLongName("maxNGramSize").withRequired(false)
            .withArgument(abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("(Optional) The maximum size of ngrams to create"
                    + " (2 = bigrams, 3 = trigrams, etc) Default Value:1")
            .withShortName("ng").create();

    Option sequentialAccessVectorOpt = obuilder.withLongName("sequentialAccessVector").withRequired(false)
            .withDescription(
                    "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
            .withShortName("seq").create();

    Option namedVectorOpt = obuilder.withLongName("namedVector").withRequired(false)
            .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
            .withShortName("nv").create();

    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(false)
            .withDescription("If set, overwrite the output directory").withShortName("ow").create();
    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
            .create();

    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
            .withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
            .withOption(maxDFSigmaOpt).withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
            .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt)
            .withOption(overwriteOutput).withOption(helpOpt).withOption(sequentialAccessVectorOpt)
            .withOption(namedVectorOpt).withOption(logNormalizeOpt).create();
    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);

        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return -1;
        }

        Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
        Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));

        int chunkSize = 100;
        if (cmdLine.hasOption(chunkSizeOpt)) {
            chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
        }
        int minSupport = 2;
        if (cmdLine.hasOption(minSupportOpt)) {
            String minSupportString = (String) cmdLine.getValue(minSupportOpt);
            minSupport = Integer.parseInt(minSupportString);
        }

        int maxNGramSize = 1;

        if (cmdLine.hasOption(maxNGramSizeOpt)) {
            try {
                maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
            } catch (NumberFormatException ex) {
                log.warn("Could not parse ngram size option");
            }
        }
        log.info("Maximum n-gram size is: {}", maxNGramSize);

        if (cmdLine.hasOption(overwriteOutput)) {
            HadoopUtil.delete(getConf(), outputDir);
        }

        float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
        if (cmdLine.hasOption(minLLROpt)) {
            minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
        }
        log.info("Minimum LLR value: {}", minLLRValue);

        int reduceTasks = 1;
        if (cmdLine.hasOption(numReduceTasksOpt)) {
            reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
        }
        log.info("Number of reduce tasks: {}", reduceTasks);

        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
        if (cmdLine.hasOption(analyzerNameOpt)) {
            String className = cmdLine.getValue(analyzerNameOpt).toString();
            analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
            // try instantiating it, b/c there isn't any point in setting it if
            // you can't instantiate it
            AnalyzerUtils.createAnalyzer(analyzerClass);
        }

        boolean processIdf;

        if (cmdLine.hasOption(weightOpt)) {
            String wString = cmdLine.getValue(weightOpt).toString();
            if ("tf".equalsIgnoreCase(wString)) {
                processIdf = false;
            } else if ("tfidf".equalsIgnoreCase(wString)) {
                processIdf = true;
            } else {
                throw new OptionException(weightOpt);
            }
        } else {
            processIdf = true;
        }

        int minDf = 1;
        if (cmdLine.hasOption(minDFOpt)) {
            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
        }
        int maxDFPercent = 99;
        if (cmdLine.hasOption(maxDFPercentOpt)) {
            maxDFPercent = Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
        }
        double maxDFSigma = -1.0;
        if (cmdLine.hasOption(maxDFSigmaOpt)) {
            maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
        }

        float norm = PartialVectorMerger.NO_NORMALIZING;
        if (cmdLine.hasOption(powerOpt)) {
            String power = cmdLine.getValue(powerOpt).toString();
            if ("INF".equals(power)) {
                norm = Float.POSITIVE_INFINITY;
            } else {
                norm = Float.parseFloat(power);
            }
        }

        boolean logNormalize = false;
        if (cmdLine.hasOption(logNormalizeOpt)) {
            logNormalize = true;
        }
        log.info("Tokenizing documents in {}", inputDir);
        Configuration conf = getConf();
        Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
        //TODO: move this into DictionaryVectorizer , and then fold SparseVectorsFrom with EncodedVectorsFrom
        // to have one framework for all of this.
        DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

        boolean sequentialAccessOutput = false;
        if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
            sequentialAccessOutput = true;
        }

        boolean namedVectors = false;
        if (cmdLine.hasOption(namedVectorOpt)) {
            namedVectors = true;
        }
        boolean shouldPrune = maxDFSigma >= 0.0 || maxDFPercent > 0.00;
        String tfDirName = shouldPrune ? DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-toprune"
                : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;
        log.info("Creating Term Frequency Vectors");
        if (processIdf) {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, -1.0f, false, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        } else {
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);
        }

        Pair<Long[], List<Path>> docFrequenciesFeatures = null;
        // Should document frequency features be processed
        if (shouldPrune || processIdf) {
            log.info("Calculating IDF");
            docFrequenciesFeatures = TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf,
                    chunkSize);
        }

        long maxDF = maxDFPercent; //if we are pruning by std dev, then this will get changed
        if (shouldPrune) {
            long vectorCount = docFrequenciesFeatures.getFirst()[1];
            if (maxDFSigma >= 0.0) {
                Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
                Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);

                // Calculate the standard deviation
                double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0, conf);
                maxDF = (int) (100.0 * maxDFSigma * stdDev / vectorCount);
            }

            long maxDFThreshold = (long) (vectorCount * (maxDF / 100.0f));

            // Prune the term frequency vectors
            Path tfDir = new Path(outputDir, tfDirName);
            Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
            Path prunedPartialTFDir = new Path(outputDir,
                    DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER + "-partial");
            log.info("Pruning");
            if (processIdf) {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, -1.0f, false, reduceTasks);
            } else {
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, norm, logNormalize, reduceTasks);
            }
            HadoopUtil.delete(new Configuration(conf), tfDir);
        }
        if (processIdf) {
            TFIDFConverter.processTfIdf(new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
                    outputDir, conf, docFrequenciesFeatures, minDf, maxDF, norm, logNormalize,
                    sequentialAccessOutput, namedVectors, reduceTasks);
        }
    } catch (OptionException e) {
        log.error("Exception", e);
        CommandLineUtil.printHelp(group);
    }
    return 0;
}