Example usage for org.apache.mahout.vectorizer SparseVectorsFromSequenceFiles SparseVectorsFromSequenceFiles

List of usage examples for the org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles constructor

Introduction

On this page you can find example usages of the org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles constructor.

Prototype

SparseVectorsFromSequenceFiles

Source Link

Usage

From source file:com.clustertest2.clustertest2.vectorization.MRDocVectorizer.java

/**
 * Runs Mahout's {@code SparseVectorsFromSequenceFiles} tool with the given arguments.
 *
 * <p>Failures are logged instead of being propagated to the caller, and {@code numThreads}
 * is decremented once the run has finished, whether or not it succeeded.
 *
 * @param args command-line style arguments handed unchanged to the tool
 */
@Override
public void performWork(String[] args) {
    try {
        SparseVectorsFromSequenceFiles vectorizeJob = new SparseVectorsFromSequenceFiles();
        int exitCode = ToolRunner.run(vectorizeJob, args);
        if (exitCode != 0) {
            // The tool can report failure through its exit code without throwing.
            logger.log(Level.SEVERE, "Vectorization job exited with non-zero status {0}", exitCode);
        }
    } catch (Exception e) {
        // java.util.logging does not expand printf-style "%s" placeholders; pass the
        // throwable itself so that its type, message and stack trace are all recorded.
        logger.log(Level.SEVERE, "Error converting sequence file to vectors", e);
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file:org.plista.kornakapi.core.training.FromDirectoryVectorizer.java

License:Apache License

/**
 * Builds the command line for Mahout's {@code SparseVectorsFromSequenceFiles} tool and runs it.
 *
 * @param tfWeighting if {@code true} the tool is run with {@code --weight tf}, otherwise with
 *        {@code --weight tfidf}
 * @param named if {@code true} the {@code -nv} flag is passed so that output vectors are named
 * @param maxDFSigma value passed as {@code --maxDFSigma}; the option is omitted when negative
 * @param inputPath location handed to the tool as {@code -i}
 * @param outputPath location handed to the tool as {@code -o}
 * @throws IllegalStateException if the tool finishes with a non-zero exit code
 * @throws Exception if running the tool fails
 */
private void generateSparseVectors(boolean tfWeighting, boolean named, double maxDFSigma, Path inputPath,
        Path outputPath) throws Exception {
    List<String> argList = Lists.newArrayList();
    argList.add("-i");
    argList.add(inputPath.toString());
    argList.add("-o");
    argList.add(outputPath.toString());
    argList.add("-seq");
    if (named) {
        argList.add("-nv");
    }
    if (maxDFSigma >= 0) {
        argList.add("--maxDFSigma");
        argList.add(String.valueOf(maxDFSigma));
    }
    argList.add("--weight");
    argList.add(tfWeighting ? "tf" : "tfidf");

    String[] args = argList.toArray(new String[0]);
    int exitCode = ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
    if (exitCode != 0) {
        // A non-zero exit code means no usable vectors were produced; fail here rather than
        // letting a later step trip over the missing output.
        throw new IllegalStateException("SparseVectorsFromSequenceFiles exited with code " + exitCode
                + " for input " + inputPath);
    }
}

From source file:org.plista.kornakapi.core.training.FromFileVectorizer.java

License:Apache License

/**
 * /*w w w. j a  v a2s .  c  om*/
 * @param tfWeighting, either if true tf(unnormalized term-frequency) else TFIDF(normalized through maxFrequncy)
 * @param named, if true output Vectors are named
 * @param maxDFSigma, Maximum Standard deviation of termfrequency, 
 * @param inputPath
 * @param outputPath
 * @throws Exception
 */
private void generateSparseVectors(boolean tfWeighting, boolean named, double maxDFSigma, Path inputPath,
        Path outputPath) throws Exception {
    List<String> argList = Lists.newLinkedList();
    argList.add("-ow");
    argList.add("-i");
    argList.add(inputPath.toString());
    argList.add("-o");
    argList.add(outputPath.toString());
    argList.add("-seq");
    if (named) {
        argList.add("-nv");
    }
    if (maxDFSigma >= 0) {
        argList.add("--maxDFSigma");
        argList.add(String.valueOf(maxDFSigma));
    }
    if (tfWeighting) {
        argList.add("--weight");
        argList.add("tf");
    } else {
        argList.add("--weight");
        argList.add("tfidf");
    }
    String[] args = argList.toArray(new String[argList.size()]);
    //String[] seqToVectorArgs = {"--weight", "tfidf", "--input", inputPath.toString(), "--output",  outputPath.toString(), "--maxDFPercent", "70", "--maxNGramSize", "2", "--namedVector"};
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
}

From source file:org.plista.kornakapi.core.training.FromLuceneVectorizer.java

License:Apache License

/**
 * /*w w  w .  j a  v  a  2s  .com*/
 * @param tfWeighting, either if true tf(unnormalized term-frequency) else TFIDF(normalized through maxFrequncy)
 * @param named, if true output Vectors are named
 * @param maxDFSigma, Maximum Standard deviation of termfrequency, 
 * @param inputPath
 * @param outputPath
 * @throws Exception
 */
protected void generateSparseVectors(boolean tfWeighting, boolean named, double maxDFSigma, Path inputPath,
        Path outputPath) throws Exception {
    List<String> argList = Lists.newLinkedList();
    argList.add("-i");
    argList.add(inputPath.toString());
    argList.add("-o");
    argList.add(outputPath.toString());
    argList.add("-seq");
    if (named) {
        argList.add("-nv");
    }
    if (maxDFSigma >= 0) {
        argList.add("--maxDFSigma");
        argList.add(String.valueOf(maxDFSigma));
    }
    if (tfWeighting) {
        argList.add("--weight");
        argList.add("tf");
    } else {
        argList.add("--weight");
        argList.add("tfidf");
    }
    String[] args = argList.toArray(new String[argList.size()]);
    //String[] seqToVectorArgs = {"--weight", "tfidf", "--input", inputPath.toString(), "--output",  outputPath.toString(), "--maxDFPercent", "70", "--maxNGramSize", "2", "--namedVector"};
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
}

From source file:root.benchmark.ReutersVectorizationJob.java

License:Apache License

/**
 * {@inheritDoc}/*from   w w w. j a  v  a2 s .c  o m*/
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    String workingDirectory = workingFS.getWorkingDirectory() + "/";

    outputDirectory = workingDirectory + outputDirectory;

    Path inputDirectoryPath = new Path(inputDirectory);
    Path outputDirectoryPath = new Path(outputDirectory);

    if (!inputFS.exists(inputDirectoryPath)) {
        throw new Exception("Input directory not found.");
    }
    if (workingFS.delete(outputDirectoryPath, true)) {
        System.out.println("Output directory cleaned.");
    }

    sequenceFilesDirectory = outputDirectory + sequenceFilesDirectory;
    vectorDirectory = outputDirectory + vectorDirectory;
    filenameDictionaryDirectory = outputDirectory + filenameDictionaryDirectory;
    renamedInputdirectory = outputDirectory + renamedInputdirectory;

    // 1: Renames files 1-N
    System.out.println();
    System.out.println("--------------");
    System.out.println("Renaming Files");
    System.out.println("--------------");
    System.out.println("\tInput: " + inputDirectory);
    System.out.println("\tOutput: " + renamedInputdirectory);
    System.out.println();
    String[] arguments_renameFiles = { "-i", inputDirectory, "-o", renamedInputdirectory, "-f",
            filenameDictionaryDirectory };
    ToolRunner.run(new RenameFilesJob(), arguments_renameFiles);

    // 2: Converts text to sequence file
    System.out.println();
    System.out.println("--------------------------------------");
    System.out.println("Creating Sequence Files From Directory");
    System.out.println("--------------------------------------");
    System.out.println("\tInput: " + renamedInputdirectory);
    System.out.println("\tOutput: " + sequenceFilesDirectory);
    System.out.println();
    String[] arguments_SequenceFilesFromDirectory = { "-i", renamedInputdirectory, "-o",
            sequenceFilesDirectory };
    ToolRunner.run(new SequenceFilesFromDirectory(), arguments_SequenceFilesFromDirectory);

    // 3: Creates vectors of text
    System.out.println();
    System.out.println("----------------");
    System.out.println("Creating Vectors");
    System.out.println("----------------");
    System.out.println("\tInput: " + sequenceFilesDirectory);
    System.out.println("\tOutput: " + vectorDirectory);
    System.out.println();
    String[] arguments_SparseVectorsFromSequenceFiles = { "-i", sequenceFilesDirectory, "-o", vectorDirectory,
            "-x", exclusionThreshold, "-md", minimumDocumentFrequency };
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), arguments_SparseVectorsFromSequenceFiles);

    return 0;

}

From source file:root.input.reuters21578.VectorizationJob.java

License:Apache License

/**
 * {@inheritDoc}/*from w w w .ja va  2s . c o m*/
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    String workingDirectory = workingFS.getWorkingDirectory() + "/";

    outputDirectory = workingDirectory + outputDirectory;

    Path inputDirectoryPath = new Path(inputDirectory);
    Path outputDirectoryPath = new Path(outputDirectory);

    if (!inputFS.exists(inputDirectoryPath)) {
        throw new Exception("Input directory not found.");
    }
    if (workingFS.delete(outputDirectoryPath, true)) {
        System.out.println("Output directory cleaned.");
    }

    sequenceFilesDirectory = outputDirectory + sequenceFilesDirectory;
    vectorDirectory = outputDirectory + vectorDirectory;
    similarityMatrixDirectory = outputDirectory + similarityMatrixDirectory;
    filenameDictionaryDirectory = outputDirectory + filenameDictionaryDirectory;
    renamedInputdirectory = outputDirectory + renamedInputdirectory;

    // 1: Renames files 1-N
    System.out.println();
    System.out.println("--------------");
    System.out.println("Renaming Files");
    System.out.println("--------------");
    System.out.println("\tInput: " + inputDirectory);
    System.out.println("\tOutput: " + renamedInputdirectory);
    System.out.println();
    String[] arguments_renameFiles = { "-i", inputDirectory, "-o", renamedInputdirectory, "-f",
            filenameDictionaryDirectory };
    ToolRunner.run(new RenameFilesJob(), arguments_renameFiles);

    // 2: Converts text to sequence file
    System.out.println();
    System.out.println("--------------------------------------");
    System.out.println("Creating Sequence Files From Directory");
    System.out.println("--------------------------------------");
    System.out.println("\tInput: " + renamedInputdirectory);
    System.out.println("\tOutput: " + sequenceFilesDirectory);
    System.out.println();
    String[] arguments_SequenceFilesFromDirectory = { "-i", renamedInputdirectory, "-o",
            sequenceFilesDirectory };
    ToolRunner.run(new SequenceFilesFromDirectory(), arguments_SequenceFilesFromDirectory);

    // 3: Creates vectors of text
    System.out.println();
    System.out.println("----------------");
    System.out.println("Creating Vectors");
    System.out.println("----------------");
    System.out.println("\tInput: " + sequenceFilesDirectory);
    System.out.println("\tOutput: " + vectorDirectory);
    System.out.println();
    String[] arguments_SparseVectorsFromSequenceFiles = { "-i", sequenceFilesDirectory, "-o", vectorDirectory,
            "-x", exclusionThreshold, "-md", minimumDocumentFrequency };
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), arguments_SparseVectorsFromSequenceFiles);

    // 4: Create a similarity matrix.
    System.out.println();
    System.out.println("--------------------------");
    System.out.println("Creating Similarity Matrix");
    System.out.println("--------------------------");
    System.out.println("\tInput: " + vectorDirectory + "/" + tf_tfidf + "-vectors");
    System.out.println("\tOutput: " + similarityMatrixDirectory);
    System.out.println("\tLevels: " + numLevels);
    System.out.println();
    String[] arguments_CreateSimilaritySimilarityJob = { "-i", vectorDirectory + "/" + tf_tfidf + "-vectors",
            "-o", similarityMatrixDirectory, "-dm", distanceMetric, "-l", numLevels, "-smd", diagScale };
    ToolRunner.run(new CreateSimilarityMatrixJob(), arguments_CreateSimilaritySimilarityJob);

    return 0;

}