Example usage for org.apache.mahout.text SequenceFilesFromDirectory SequenceFilesFromDirectory

List of usage examples for org.apache.mahout.text SequenceFilesFromDirectory SequenceFilesFromDirectory

Introduction

On this page you can find example usages of the org.apache.mahout.text.SequenceFilesFromDirectory constructor, SequenceFilesFromDirectory().

Prototype

SequenceFilesFromDirectory

Source Link

Usage

From source file:org.plista.kornakapi.core.training.FromDirectoryVectorizer.java

License:Apache License

/**
 * Runs {@link SequenceFilesFromDirectory} through {@link ToolRunner}, passing
 * {@code DocumentFilesPath} as {@code -i}, {@code sequenceFilesPath} as {@code -o},
 * and the {@code -ow} flag.
 *
 * <p>Any exception raised by the run is caught here and its stack trace printed;
 * nothing is propagated to the caller, and the tool's return value is discarded.
 */
private void generateSequneceFiles() {
    final String[] toolArgs = {
            "-i", DocumentFilesPath.toString(),
            "-o", sequenceFilesPath.toString(),
            "-ow" };
    try {
        ToolRunner.run(new SequenceFilesFromDirectory(), toolArgs);
    } catch (Exception failure) {
        // NOTE(review): failures are only printed, so callers cannot tell that
        // no sequence files were produced -- confirm this is intended.
        failure.printStackTrace();
    }
}

From source file:org.plista.kornakapi.core.training.FromFileVectorizer.java

License:Apache License

/**
 * Generates SequenceFile/*from   www  .  ja  v a  2s.co m*/
 * @throws Exception 
 */
private void generateSequneceFiles() throws Exception {
    List<String> argList = Lists.newLinkedList();
    argList.add("-i");
    argList.add(DocumentFilesPath.toString());
    argList.add("-o");
    argList.add(sequenceFilesPath.toString());
    argList.add("-ow");
    String[] args = argList.toArray(new String[argList.size()]);
    ToolRunner.run(new SequenceFilesFromDirectory(), args);

}

From source file:root.benchmark.ReutersVectorizationJob.java

License:Apache License

/**
 * {@inheritDoc}
 *
 * <p>Runs three stages in order, each launched through {@link ToolRunner}:
 * {@code RenameFilesJob}, {@code SequenceFilesFromDirectory}, and
 * {@code SparseVectorsFromSequenceFiles}. Before the stages run, the output directory
 * is prefixed with the working directory of the file system named by the
 * {@code fs.default.name} configuration value and is deleted recursively.
 *
 * <p>The directory fields ({@code outputDirectory}, {@code sequenceFilesDirectory},
 * {@code vectorDirectory}, {@code filenameDictionaryDirectory},
 * {@code renamedInputdirectory}) are overwritten with their prefixed values.
 *
 * @param args arguments handed to {@code parseArguments}
 * @return {@code -1} if {@code parseArguments} returns {@code null}; the exit code of
 *         the first stage that returns non-zero; otherwise {@code 0}
 * @throws Exception if the input directory does not exist, or if any stage throws
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    // Input and output may live on different file systems, so each gets its own handle.
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    String workingDirectory = workingFS.getWorkingDirectory() + "/";

    outputDirectory = workingDirectory + outputDirectory;

    Path inputDirectoryPath = new Path(inputDirectory);
    Path outputDirectoryPath = new Path(outputDirectory);

    if (!inputFS.exists(inputDirectoryPath)) {
        throw new Exception("Input directory not found.");
    }
    // delete() returns true only when something was actually removed.
    if (workingFS.delete(outputDirectoryPath, true)) {
        System.out.println("Output directory cleaned.");
    }

    // Every intermediate directory is placed under the resolved output directory.
    sequenceFilesDirectory = outputDirectory + sequenceFilesDirectory;
    vectorDirectory = outputDirectory + vectorDirectory;
    filenameDictionaryDirectory = outputDirectory + filenameDictionaryDirectory;
    renamedInputdirectory = outputDirectory + renamedInputdirectory;

    int exitCode;

    // 1: Renames files 1-N
    System.out.println();
    System.out.println("--------------");
    System.out.println("Renaming Files");
    System.out.println("--------------");
    System.out.println("\tInput: " + inputDirectory);
    System.out.println("\tOutput: " + renamedInputdirectory);
    System.out.println();
    String[] arguments_renameFiles = { "-i", inputDirectory, "-o", renamedInputdirectory, "-f",
            filenameDictionaryDirectory };
    exitCode = ToolRunner.run(new RenameFilesJob(), arguments_renameFiles);
    if (exitCode != 0) {
        // Later stages consume this stage's output; stop instead of running on bad input.
        return exitCode;
    }

    // 2: Converts text to sequence file
    System.out.println();
    System.out.println("--------------------------------------");
    System.out.println("Creating Sequence Files From Directory");
    System.out.println("--------------------------------------");
    System.out.println("\tInput: " + renamedInputdirectory);
    System.out.println("\tOutput: " + sequenceFilesDirectory);
    System.out.println();
    String[] arguments_SequenceFilesFromDirectory = { "-i", renamedInputdirectory, "-o",
            sequenceFilesDirectory };
    exitCode = ToolRunner.run(new SequenceFilesFromDirectory(), arguments_SequenceFilesFromDirectory);
    if (exitCode != 0) {
        return exitCode;
    }

    // 3: Creates vectors of text
    System.out.println();
    System.out.println("----------------");
    System.out.println("Creating Vectors");
    System.out.println("----------------");
    System.out.println("\tInput: " + sequenceFilesDirectory);
    System.out.println("\tOutput: " + vectorDirectory);
    System.out.println();
    String[] arguments_SparseVectorsFromSequenceFiles = { "-i", sequenceFilesDirectory, "-o", vectorDirectory,
            "-x", exclusionThreshold, "-md", minimumDocumentFrequency };
    exitCode = ToolRunner.run(new SparseVectorsFromSequenceFiles(), arguments_SparseVectorsFromSequenceFiles);
    if (exitCode != 0) {
        return exitCode;
    }

    return 0;

}

From source file:root.input.reuters21578.VectorizationJob.java

License:Apache License

/**
 * {@inheritDoc}/*from   ww w .  j a v  a 2s  . c  o  m*/
 */
@Override
public int run(String[] args) throws Exception {

    constructParameterList();

    if (parseArguments(args) == null) {
        return -1;
    }

    initializeConfigurationParameters();

    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);

    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    String workingDirectory = workingFS.getWorkingDirectory() + "/";

    outputDirectory = workingDirectory + outputDirectory;

    Path inputDirectoryPath = new Path(inputDirectory);
    Path outputDirectoryPath = new Path(outputDirectory);

    if (!inputFS.exists(inputDirectoryPath)) {
        throw new Exception("Input directory not found.");
    }
    if (workingFS.delete(outputDirectoryPath, true)) {
        System.out.println("Output directory cleaned.");
    }

    sequenceFilesDirectory = outputDirectory + sequenceFilesDirectory;
    vectorDirectory = outputDirectory + vectorDirectory;
    similarityMatrixDirectory = outputDirectory + similarityMatrixDirectory;
    filenameDictionaryDirectory = outputDirectory + filenameDictionaryDirectory;
    renamedInputdirectory = outputDirectory + renamedInputdirectory;

    // 1: Renames files 1-N
    System.out.println();
    System.out.println("--------------");
    System.out.println("Renaming Files");
    System.out.println("--------------");
    System.out.println("\tInput: " + inputDirectory);
    System.out.println("\tOutput: " + renamedInputdirectory);
    System.out.println();
    String[] arguments_renameFiles = { "-i", inputDirectory, "-o", renamedInputdirectory, "-f",
            filenameDictionaryDirectory };
    ToolRunner.run(new RenameFilesJob(), arguments_renameFiles);

    // 2: Converts text to sequence file
    System.out.println();
    System.out.println("--------------------------------------");
    System.out.println("Creating Sequence Files From Directory");
    System.out.println("--------------------------------------");
    System.out.println("\tInput: " + renamedInputdirectory);
    System.out.println("\tOutput: " + sequenceFilesDirectory);
    System.out.println();
    String[] arguments_SequenceFilesFromDirectory = { "-i", renamedInputdirectory, "-o",
            sequenceFilesDirectory };
    ToolRunner.run(new SequenceFilesFromDirectory(), arguments_SequenceFilesFromDirectory);

    // 3: Creates vectors of text
    System.out.println();
    System.out.println("----------------");
    System.out.println("Creating Vectors");
    System.out.println("----------------");
    System.out.println("\tInput: " + sequenceFilesDirectory);
    System.out.println("\tOutput: " + vectorDirectory);
    System.out.println();
    String[] arguments_SparseVectorsFromSequenceFiles = { "-i", sequenceFilesDirectory, "-o", vectorDirectory,
            "-x", exclusionThreshold, "-md", minimumDocumentFrequency };
    ToolRunner.run(new SparseVectorsFromSequenceFiles(), arguments_SparseVectorsFromSequenceFiles);

    // 4: Create a similarity matrix.
    System.out.println();
    System.out.println("--------------------------");
    System.out.println("Creating Similarity Matrix");
    System.out.println("--------------------------");
    System.out.println("\tInput: " + vectorDirectory + "/" + tf_tfidf + "-vectors");
    System.out.println("\tOutput: " + similarityMatrixDirectory);
    System.out.println("\tLevels: " + numLevels);
    System.out.println();
    String[] arguments_CreateSimilaritySimilarityJob = { "-i", vectorDirectory + "/" + tf_tfidf + "-vectors",
            "-o", similarityMatrixDirectory, "-dm", distanceMetric, "-l", numLevels, "-smd", diagScale };
    ToolRunner.run(new CreateSimilarityMatrixJob(), arguments_CreateSimilaritySimilarityJob);

    return 0;

}