List of usage examples for org.apache.mahout.vectorizer.DocumentProcessor#ANALYZER_CLASS
String ANALYZER_CLASS
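ANALYZER_CLASS is the Hadoop configuration key under which a job stores the fully qualified class name of the Lucene Analyzer to use: the driver writes the name into the job configuration, and each mapper reads it back in setup() and instantiates the class. Below is a minimal sketch of that round trip, modeled on the examples that follow; the import paths assume the Mahout 0.6-era API these examples were written against, and DefaultAnalyzer is used as both the value and the fallback purely for illustration.

// Hedged sketch: wiring an analyzer class through a Hadoop job configuration,
// following the driver/mapper pattern shown in the examples below.
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.apache.mahout.vectorizer.DocumentProcessor;

public class AnalyzerClassRoundTrip {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Driver side: record the analyzer's class name under the shared key.
        conf.set(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName());

        // Mapper side: read the class name back (with a default) and instantiate it.
        Analyzer analyzer = ClassUtils.instantiateAs(
                conf.get(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName()),
                Analyzer.class);
        System.out.println("Using analyzer: " + analyzer.getClass().getName());
    }
}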
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    analyzer = ClassUtils.instantiateAs(
            context.getConfiguration().get(DocumentProcessor.ANALYZER_CLASS,
                    DefaultAnalyzer.class.getName()),
            Analyzer.class);
}
From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    String analyzerClassName = context.getConfiguration().get(DocumentProcessor.ANALYZER_CLASS,
            StandardAnalyzer.class.getName());
    try {
        analyzer = AnalyzerUtils.createAnalyzer(analyzerClassName);
    } catch (ClassNotFoundException e) {
        throw new IOException("Unable to create analyzer: " + analyzerClassName, e);
    }
}
From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 6) {
        printUsage();
    }
    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    String analyzerClassName = args[4];
    int maxIdsPerReq = Integer.valueOf(args[5]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    // upload dictionary file to HDFS
    // FileSystem fs = FileSystem.get(getConf());
    // Path dictionaryPath = new Path(outputPath, Utilities.path2FileName(dictionaryFile));
    // BufferedWriter writer = new BufferedWriter(
    //         new OutputStreamWriter(fs.create(dictionaryPath, true)));
    // BufferedReader reader = new BufferedReader(new FileReader(dictionaryFile));
    // String line = null;
    // while ((line = reader.readLine()) != null) {
    //     writer.write(line + "\n");
    // }
    // writer.close();

    Job job = new Job(getConf(), "Copy and tokenize data from HTRC data storage in parallel.");
    job.setJarByClass(DataCopyTokenizerJob.class);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set distributed cache
    // Path dictionaryPath = new Path(dictionaryFile);
    // DistributedCache.setCacheFiles(new URI[] { dictionaryPath.toUri() }, job.getConfiguration());

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    job.setMapperClass(DataCopyTokenizerMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("DataCopyTokenizerJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}
From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerMapper.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    analyzer = ClassUtils.instantiateAs(
            context.getConfiguration().get(DocumentProcessor.ANALYZER_CLASS,
                    DefaultAnalyzer.class.getName()),
            Analyzer.class);

    // Configuration conf = context.getConfiguration();
    // URI[] localFiles = DistributedCache.getCacheFiles(conf);
    // if (localFiles == null || localFiles.length == 0)
    //     throw new RuntimeException("Cannot find paths from distribute cache.");
    //
    // Path dictionaryFile = new Path(localFiles[0].getPath());
    // FileSystem fs = FileSystem.get(conf);
    // BufferedReader reader = new BufferedReader(new InputStreamReader(
    //         fs.open(dictionaryFile)));
    // String term = null;
    // while ((term = reader.readLine()) != null) {
    //     dictionary.add(term.toLowerCase());
    // }
    // reader.close();
}
From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromRawText.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 7) {
        printUsage();
    }
    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    int maxIdsPerSplit = Integer.valueOf(args[3]);
    String dataAPIConfClassName = args[4];
    String analyzerClassName = args[5];
    int maxIdsPerReq = Integer.valueOf(args[6]);

    logger.info("SparseVectorsFromRawText ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SparseVectorsFromRawText.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);
    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);
    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}
From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License:Apache License
private void setupConfiguration(Configuration conf) throws ClassNotFoundException, IOException {
    // set dictionary
    conf.set(HTRCConstants.DICTIONARY_PATH, dictDir);
    // set analyzer
    conf.set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);
    // set data api conf
    conf.setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);
    // set memcached conf
    MemCachedUtil.configHelper(conf, memHostsPath);
}
From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License:Apache License
private void sequentialTransform() throws Exception {
    Configuration conf = getConf();
    setupConfiguration(conf);
    HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);

    // set up analyzer, filter
    Analyzer analyzer = ClassUtils.instantiateAs(
            conf.get(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName()),
            Analyzer.class);
    HTRCFilter filter = new StopWordFilter("stopwords.txt"); // found in the classpath
    Dictionary dictionary = new Dictionary(conf);
    filter.addNextFilter(new DictionaryFilter(dictionary));
    filter.addNextFilter(new WordLengthFilter(conf.getInt(HTRCConstants.FILTER_WORD_MIN_LENGTH, 2)));

    // memcached client
    ThreadedMemcachedClient memcachedClient = ThreadedMemcachedClient.getThreadedMemcachedClient(conf);
    MemcachedClient cache = memcachedClient.getCache();
    int maxExpir = conf.getInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, -1);
    Transcoder<VectorWritable> transcoder = new HadoopWritableTranscoder<VectorWritable>(conf,
            VectorWritable.class);

    Path input = new Path(idListDir);
    FileSystem fs = input.getFileSystem(conf);
    DataInputStream fsinput = new DataInputStream(fs.open(input));
    BufferedReader reader = new BufferedReader(new InputStreamReader(fsinput));
    String line = null;
    int idNumThreshold = maxIdsPerReq;
    int idNum = 0;
    StringBuilder idList = new StringBuilder();
    VectorWritable vectorWritable = new VectorWritable();
    while ((line = reader.readLine()) != null) {
        idList.append(line + "|");
        if ((++idNum) >= idNumThreshold) {
            // <id, content>
            Iterable<Entry<String, String>> content = client.getID2Content(idList.toString());
            for (Entry<String, String> entry : content) {
                Vector result = transform2Vector(entry.getValue(), entry.getKey(), analyzer, filter,
                        dictionary);
                vectorWritable.set(result);
                cache.set(entry.getKey(), maxExpir, vectorWritable, transcoder);

                // validate
                VectorWritable vecWritable = cache.get(entry.getKey(), transcoder);
                if (vecWritable == null) {
                    throw new RuntimeException(entry.getKey() + " is not written to Memcached.");
                } else {
                    System.out.println(entry.getKey());
                }
            }
            idList = new StringBuilder();
            idNum = 0;
        }
    }
    if (idList.length() > 0) {
        Iterable<Entry<String, String>> content = client.getID2Content(idList.toString());
        for (Entry<String, String> entry : content) {
            Vector result = transform2Vector(entry.getValue(), entry.getKey(), analyzer, filter, dictionary);
            vectorWritable.set(result);
            cache.set(entry.getKey(), maxExpir, vectorWritable, transcoder);

            // validate
            VectorWritable vecWritable = cache.get(entry.getKey(), transcoder);
            if (vecWritable == null) {
                throw new RuntimeException(entry.getKey() + " is not written to Memcached.");
            } else {
                System.out.println(entry.getKey());
            }
        }
    }
}
From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2HDFS.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }
    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    String analyzerClassName = args[3];

    logger.info("SVFromHDFS2HDFS ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - analyzerName: " + analyzerClassName);

    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SVFromHDFS2HDFS.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);
    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SVFromHDFS2HDFS took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}
From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2Memcached.java
License:Apache License
private void setupConfiguration(Configuration conf) throws ClassNotFoundException, IOException {
    // set dictionary
    conf.set(HTRCConstants.DICTIONARY_PATH, dictDir);
    // set analyzer
    conf.set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);
    // set memcached conf
    MemCachedUtil.configHelper(conf, memHostsPath);
}