List of usage examples for the org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles constructor, SparseVectorsFromSequenceFiles()
SparseVectorsFromSequenceFiles
From source file:com.clustertest2.clustertest2.vectorization.MRDocVectorizer.java
@Override public void performWork(String[] args) { // //from w w w . j a v a2 s .c o m try { SparseVectorsFromSequenceFiles VectorizeJob = new SparseVectorsFromSequenceFiles(); ToolRunner.run(VectorizeJob, args); } catch (Exception e) { System.out.println(e); logger.log(Level.SEVERE, "Error converting sequence file to vectors: %s", e.getClass()); } finally { numThreads.decrementAndGet(); } }
From source file:org.plista.kornakapi.core.training.FromDirectoryVectorizer.java
License:Apache License
/** * //from w w w. j av a2 s .c o m * @param tfWeighting, either if true tf(unnormalized term-frequency) else TFIDF(normalized through maxFrequncy) * @param named, if true output Vectors are named * @param maxDFSigma, Maximum Standard deviation of termfrequency, * @param inputPath * @param outputPath * @throws Exception */ private void generateSparseVectors(boolean tfWeighting, boolean named, double maxDFSigma, Path inputPath, Path outputPath) throws Exception { List<String> argList = Lists.newLinkedList(); argList.add("-i"); argList.add(inputPath.toString()); argList.add("-o"); argList.add(outputPath.toString()); argList.add("-seq"); if (named) { argList.add("-nv"); } if (maxDFSigma >= 0) { argList.add("--maxDFSigma"); argList.add(String.valueOf(maxDFSigma)); } if (tfWeighting) { argList.add("--weight"); argList.add("tf"); } else { argList.add("--weight"); argList.add("tfidf"); } String[] args = argList.toArray(new String[argList.size()]); //String[] seqToVectorArgs = {"--weight", "tfidf", "--input", inputPath.toString(), "--output", outputPath.toString(), "--maxDFPercent", "70", "--maxNGramSize", "2", "--namedVector"}; ToolRunner.run(new SparseVectorsFromSequenceFiles(), args); }
From source file:org.plista.kornakapi.core.training.FromFileVectorizer.java
License:Apache License
/** * /*w w w. j a v a2s . c om*/ * @param tfWeighting, either if true tf(unnormalized term-frequency) else TFIDF(normalized through maxFrequncy) * @param named, if true output Vectors are named * @param maxDFSigma, Maximum Standard deviation of termfrequency, * @param inputPath * @param outputPath * @throws Exception */ private void generateSparseVectors(boolean tfWeighting, boolean named, double maxDFSigma, Path inputPath, Path outputPath) throws Exception { List<String> argList = Lists.newLinkedList(); argList.add("-ow"); argList.add("-i"); argList.add(inputPath.toString()); argList.add("-o"); argList.add(outputPath.toString()); argList.add("-seq"); if (named) { argList.add("-nv"); } if (maxDFSigma >= 0) { argList.add("--maxDFSigma"); argList.add(String.valueOf(maxDFSigma)); } if (tfWeighting) { argList.add("--weight"); argList.add("tf"); } else { argList.add("--weight"); argList.add("tfidf"); } String[] args = argList.toArray(new String[argList.size()]); //String[] seqToVectorArgs = {"--weight", "tfidf", "--input", inputPath.toString(), "--output", outputPath.toString(), "--maxDFPercent", "70", "--maxNGramSize", "2", "--namedVector"}; ToolRunner.run(new SparseVectorsFromSequenceFiles(), args); }
From source file:org.plista.kornakapi.core.training.FromLuceneVectorizer.java
License:Apache License
/** * /*w w w . j a v a 2s .com*/ * @param tfWeighting, either if true tf(unnormalized term-frequency) else TFIDF(normalized through maxFrequncy) * @param named, if true output Vectors are named * @param maxDFSigma, Maximum Standard deviation of termfrequency, * @param inputPath * @param outputPath * @throws Exception */ protected void generateSparseVectors(boolean tfWeighting, boolean named, double maxDFSigma, Path inputPath, Path outputPath) throws Exception { List<String> argList = Lists.newLinkedList(); argList.add("-i"); argList.add(inputPath.toString()); argList.add("-o"); argList.add(outputPath.toString()); argList.add("-seq"); if (named) { argList.add("-nv"); } if (maxDFSigma >= 0) { argList.add("--maxDFSigma"); argList.add(String.valueOf(maxDFSigma)); } if (tfWeighting) { argList.add("--weight"); argList.add("tf"); } else { argList.add("--weight"); argList.add("tfidf"); } String[] args = argList.toArray(new String[argList.size()]); //String[] seqToVectorArgs = {"--weight", "tfidf", "--input", inputPath.toString(), "--output", outputPath.toString(), "--maxDFPercent", "70", "--maxNGramSize", "2", "--namedVector"}; ToolRunner.run(new SparseVectorsFromSequenceFiles(), args); }
From source file:root.benchmark.ReutersVectorizationJob.java
License:Apache License
/** * {@inheritDoc}/*from w w w. j a v a2 s .c o m*/ */ @Override public int run(String[] args) throws Exception { constructParameterList(); if (parseArguments(args) == null) { return -1; } initializeConfigurationParameters(); printJobHeader(); Configuration conf = getConf(); URI workingURI = new URI(conf.get("fs.default.name")); URI inputURI = new URI(inputDirectory); FileSystem workingFS = FileSystem.get(workingURI, conf); FileSystem inputFS = FileSystem.get(inputURI, conf); String workingDirectory = workingFS.getWorkingDirectory() + "/"; outputDirectory = workingDirectory + outputDirectory; Path inputDirectoryPath = new Path(inputDirectory); Path outputDirectoryPath = new Path(outputDirectory); if (!inputFS.exists(inputDirectoryPath)) { throw new Exception("Input directory not found."); } if (workingFS.delete(outputDirectoryPath, true)) { System.out.println("Output directory cleaned."); } sequenceFilesDirectory = outputDirectory + sequenceFilesDirectory; vectorDirectory = outputDirectory + vectorDirectory; filenameDictionaryDirectory = outputDirectory + filenameDictionaryDirectory; renamedInputdirectory = outputDirectory + renamedInputdirectory; // 1: Renames files 1-N System.out.println(); System.out.println("--------------"); System.out.println("Renaming Files"); System.out.println("--------------"); System.out.println("\tInput: " + inputDirectory); System.out.println("\tOutput: " + renamedInputdirectory); System.out.println(); String[] arguments_renameFiles = { "-i", inputDirectory, "-o", renamedInputdirectory, "-f", filenameDictionaryDirectory }; ToolRunner.run(new RenameFilesJob(), arguments_renameFiles); // 2: Converts text to sequence file System.out.println(); System.out.println("--------------------------------------"); System.out.println("Creating Sequence Files From Directory"); System.out.println("--------------------------------------"); System.out.println("\tInput: " + renamedInputdirectory); System.out.println("\tOutput: " + 
sequenceFilesDirectory); System.out.println(); String[] arguments_SequenceFilesFromDirectory = { "-i", renamedInputdirectory, "-o", sequenceFilesDirectory }; ToolRunner.run(new SequenceFilesFromDirectory(), arguments_SequenceFilesFromDirectory); // 3: Creates vectors of text System.out.println(); System.out.println("----------------"); System.out.println("Creating Vectors"); System.out.println("----------------"); System.out.println("\tInput: " + sequenceFilesDirectory); System.out.println("\tOutput: " + vectorDirectory); System.out.println(); String[] arguments_SparseVectorsFromSequenceFiles = { "-i", sequenceFilesDirectory, "-o", vectorDirectory, "-x", exclusionThreshold, "-md", minimumDocumentFrequency }; ToolRunner.run(new SparseVectorsFromSequenceFiles(), arguments_SparseVectorsFromSequenceFiles); return 0; }
From source file:root.input.reuters21578.VectorizationJob.java
License:Apache License
/** * {@inheritDoc}/*from w w w .ja va 2s . c o m*/ */ @Override public int run(String[] args) throws Exception { constructParameterList(); if (parseArguments(args) == null) { return -1; } initializeConfigurationParameters(); printJobHeader(); Configuration conf = getConf(); URI workingURI = new URI(conf.get("fs.default.name")); URI inputURI = new URI(inputDirectory); FileSystem workingFS = FileSystem.get(workingURI, conf); FileSystem inputFS = FileSystem.get(inputURI, conf); String workingDirectory = workingFS.getWorkingDirectory() + "/"; outputDirectory = workingDirectory + outputDirectory; Path inputDirectoryPath = new Path(inputDirectory); Path outputDirectoryPath = new Path(outputDirectory); if (!inputFS.exists(inputDirectoryPath)) { throw new Exception("Input directory not found."); } if (workingFS.delete(outputDirectoryPath, true)) { System.out.println("Output directory cleaned."); } sequenceFilesDirectory = outputDirectory + sequenceFilesDirectory; vectorDirectory = outputDirectory + vectorDirectory; similarityMatrixDirectory = outputDirectory + similarityMatrixDirectory; filenameDictionaryDirectory = outputDirectory + filenameDictionaryDirectory; renamedInputdirectory = outputDirectory + renamedInputdirectory; // 1: Renames files 1-N System.out.println(); System.out.println("--------------"); System.out.println("Renaming Files"); System.out.println("--------------"); System.out.println("\tInput: " + inputDirectory); System.out.println("\tOutput: " + renamedInputdirectory); System.out.println(); String[] arguments_renameFiles = { "-i", inputDirectory, "-o", renamedInputdirectory, "-f", filenameDictionaryDirectory }; ToolRunner.run(new RenameFilesJob(), arguments_renameFiles); // 2: Converts text to sequence file System.out.println(); System.out.println("--------------------------------------"); System.out.println("Creating Sequence Files From Directory"); System.out.println("--------------------------------------"); System.out.println("\tInput: " + 
renamedInputdirectory); System.out.println("\tOutput: " + sequenceFilesDirectory); System.out.println(); String[] arguments_SequenceFilesFromDirectory = { "-i", renamedInputdirectory, "-o", sequenceFilesDirectory }; ToolRunner.run(new SequenceFilesFromDirectory(), arguments_SequenceFilesFromDirectory); // 3: Creates vectors of text System.out.println(); System.out.println("----------------"); System.out.println("Creating Vectors"); System.out.println("----------------"); System.out.println("\tInput: " + sequenceFilesDirectory); System.out.println("\tOutput: " + vectorDirectory); System.out.println(); String[] arguments_SparseVectorsFromSequenceFiles = { "-i", sequenceFilesDirectory, "-o", vectorDirectory, "-x", exclusionThreshold, "-md", minimumDocumentFrequency }; ToolRunner.run(new SparseVectorsFromSequenceFiles(), arguments_SparseVectorsFromSequenceFiles); // 4: Create a similarity matrix. System.out.println(); System.out.println("--------------------------"); System.out.println("Creating Similarity Matrix"); System.out.println("--------------------------"); System.out.println("\tInput: " + vectorDirectory + "/" + tf_tfidf + "-vectors"); System.out.println("\tOutput: " + similarityMatrixDirectory); System.out.println("\tLevels: " + numLevels); System.out.println(); String[] arguments_CreateSimilaritySimilarityJob = { "-i", vectorDirectory + "/" + tf_tfidf + "-vectors", "-o", similarityMatrixDirectory, "-dm", distanceMetric, "-l", numLevels, "-smd", diagScale }; ToolRunner.run(new CreateSimilarityMatrixJob(), arguments_CreateSimilaritySimilarityJob); return 0; }