List of usage examples for the org.apache.mahout.text.SequenceFilesFromDirectory constructor
SequenceFilesFromDirectory
From source file:org.plista.kornakapi.core.training.FromDirectoryVectorizer.java
License:Apache License
private void generateSequneceFiles() { List<String> argList = Lists.newLinkedList(); argList.add("-i"); argList.add(DocumentFilesPath.toString()); argList.add("-o"); argList.add(sequenceFilesPath.toString()); argList.add("-ow"); String[] args = argList.toArray(new String[argList.size()]); try {//from w w w .j a v a 2 s . co m ToolRunner.run(new SequenceFilesFromDirectory(), args); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:org.plista.kornakapi.core.training.FromFileVectorizer.java
License:Apache License
/** * Generates SequenceFile/*from www . ja v a 2s.co m*/ * @throws Exception */ private void generateSequneceFiles() throws Exception { List<String> argList = Lists.newLinkedList(); argList.add("-i"); argList.add(DocumentFilesPath.toString()); argList.add("-o"); argList.add(sequenceFilesPath.toString()); argList.add("-ow"); String[] args = argList.toArray(new String[argList.size()]); ToolRunner.run(new SequenceFilesFromDirectory(), args); }
From source file:root.benchmark.ReutersVectorizationJob.java
License:Apache License
/** * {@inheritDoc}//from ww w .j av a2 s . c o m */ @Override public int run(String[] args) throws Exception { constructParameterList(); if (parseArguments(args) == null) { return -1; } initializeConfigurationParameters(); printJobHeader(); Configuration conf = getConf(); URI workingURI = new URI(conf.get("fs.default.name")); URI inputURI = new URI(inputDirectory); FileSystem workingFS = FileSystem.get(workingURI, conf); FileSystem inputFS = FileSystem.get(inputURI, conf); String workingDirectory = workingFS.getWorkingDirectory() + "/"; outputDirectory = workingDirectory + outputDirectory; Path inputDirectoryPath = new Path(inputDirectory); Path outputDirectoryPath = new Path(outputDirectory); if (!inputFS.exists(inputDirectoryPath)) { throw new Exception("Input directory not found."); } if (workingFS.delete(outputDirectoryPath, true)) { System.out.println("Output directory cleaned."); } sequenceFilesDirectory = outputDirectory + sequenceFilesDirectory; vectorDirectory = outputDirectory + vectorDirectory; filenameDictionaryDirectory = outputDirectory + filenameDictionaryDirectory; renamedInputdirectory = outputDirectory + renamedInputdirectory; // 1: Renames files 1-N System.out.println(); System.out.println("--------------"); System.out.println("Renaming Files"); System.out.println("--------------"); System.out.println("\tInput: " + inputDirectory); System.out.println("\tOutput: " + renamedInputdirectory); System.out.println(); String[] arguments_renameFiles = { "-i", inputDirectory, "-o", renamedInputdirectory, "-f", filenameDictionaryDirectory }; ToolRunner.run(new RenameFilesJob(), arguments_renameFiles); // 2: Converts text to sequence file System.out.println(); System.out.println("--------------------------------------"); System.out.println("Creating Sequence Files From Directory"); System.out.println("--------------------------------------"); System.out.println("\tInput: " + renamedInputdirectory); System.out.println("\tOutput: " + sequenceFilesDirectory); 
System.out.println(); String[] arguments_SequenceFilesFromDirectory = { "-i", renamedInputdirectory, "-o", sequenceFilesDirectory }; ToolRunner.run(new SequenceFilesFromDirectory(), arguments_SequenceFilesFromDirectory); // 3: Creates vectors of text System.out.println(); System.out.println("----------------"); System.out.println("Creating Vectors"); System.out.println("----------------"); System.out.println("\tInput: " + sequenceFilesDirectory); System.out.println("\tOutput: " + vectorDirectory); System.out.println(); String[] arguments_SparseVectorsFromSequenceFiles = { "-i", sequenceFilesDirectory, "-o", vectorDirectory, "-x", exclusionThreshold, "-md", minimumDocumentFrequency }; ToolRunner.run(new SparseVectorsFromSequenceFiles(), arguments_SparseVectorsFromSequenceFiles); return 0; }
From source file:root.input.reuters21578.VectorizationJob.java
License:Apache License
/** * {@inheritDoc}/*from ww w . j a v a 2s . c o m*/ */ @Override public int run(String[] args) throws Exception { constructParameterList(); if (parseArguments(args) == null) { return -1; } initializeConfigurationParameters(); printJobHeader(); Configuration conf = getConf(); URI workingURI = new URI(conf.get("fs.default.name")); URI inputURI = new URI(inputDirectory); FileSystem workingFS = FileSystem.get(workingURI, conf); FileSystem inputFS = FileSystem.get(inputURI, conf); String workingDirectory = workingFS.getWorkingDirectory() + "/"; outputDirectory = workingDirectory + outputDirectory; Path inputDirectoryPath = new Path(inputDirectory); Path outputDirectoryPath = new Path(outputDirectory); if (!inputFS.exists(inputDirectoryPath)) { throw new Exception("Input directory not found."); } if (workingFS.delete(outputDirectoryPath, true)) { System.out.println("Output directory cleaned."); } sequenceFilesDirectory = outputDirectory + sequenceFilesDirectory; vectorDirectory = outputDirectory + vectorDirectory; similarityMatrixDirectory = outputDirectory + similarityMatrixDirectory; filenameDictionaryDirectory = outputDirectory + filenameDictionaryDirectory; renamedInputdirectory = outputDirectory + renamedInputdirectory; // 1: Renames files 1-N System.out.println(); System.out.println("--------------"); System.out.println("Renaming Files"); System.out.println("--------------"); System.out.println("\tInput: " + inputDirectory); System.out.println("\tOutput: " + renamedInputdirectory); System.out.println(); String[] arguments_renameFiles = { "-i", inputDirectory, "-o", renamedInputdirectory, "-f", filenameDictionaryDirectory }; ToolRunner.run(new RenameFilesJob(), arguments_renameFiles); // 2: Converts text to sequence file System.out.println(); System.out.println("--------------------------------------"); System.out.println("Creating Sequence Files From Directory"); System.out.println("--------------------------------------"); System.out.println("\tInput: " + 
renamedInputdirectory); System.out.println("\tOutput: " + sequenceFilesDirectory); System.out.println(); String[] arguments_SequenceFilesFromDirectory = { "-i", renamedInputdirectory, "-o", sequenceFilesDirectory }; ToolRunner.run(new SequenceFilesFromDirectory(), arguments_SequenceFilesFromDirectory); // 3: Creates vectors of text System.out.println(); System.out.println("----------------"); System.out.println("Creating Vectors"); System.out.println("----------------"); System.out.println("\tInput: " + sequenceFilesDirectory); System.out.println("\tOutput: " + vectorDirectory); System.out.println(); String[] arguments_SparseVectorsFromSequenceFiles = { "-i", sequenceFilesDirectory, "-o", vectorDirectory, "-x", exclusionThreshold, "-md", minimumDocumentFrequency }; ToolRunner.run(new SparseVectorsFromSequenceFiles(), arguments_SparseVectorsFromSequenceFiles); // 4: Create a similarity matrix. System.out.println(); System.out.println("--------------------------"); System.out.println("Creating Similarity Matrix"); System.out.println("--------------------------"); System.out.println("\tInput: " + vectorDirectory + "/" + tf_tfidf + "-vectors"); System.out.println("\tOutput: " + similarityMatrixDirectory); System.out.println("\tLevels: " + numLevels); System.out.println(); String[] arguments_CreateSimilaritySimilarityJob = { "-i", vectorDirectory + "/" + tf_tfidf + "-vectors", "-o", similarityMatrixDirectory, "-dm", distanceMetric, "-l", numLevels, "-smd", diagScale }; ToolRunner.run(new CreateSimilarityMatrixJob(), arguments_CreateSimilaritySimilarityJob); return 0; }