Example usage for org.apache.mahout.vectorizer DocumentProcessor ANALYZER_CLASS

Introduction

This page collects example usages of the ANALYZER_CLASS field of org.apache.mahout.vectorizer.DocumentProcessor.

Prototype

String ANALYZER_CLASS
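
Before the usage examples, the snippet below is a minimal, self-contained sketch (not taken from any of the projects listed under Usage) of the typical pattern: a job driver stores an Analyzer class name under DocumentProcessor.ANALYZER_CLASS, and a task later reads the property back and instantiates the Analyzer. The class name AnalyzerClassSketch is illustrative only, and the sketch assumes a Lucene version in which StandardAnalyzer has a public no-argument constructor, since ClassUtils.instantiateAs requires one.

import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.vectorizer.DocumentProcessor;

public class AnalyzerClassSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Driver side: record which Analyzer implementation the vectorization
        // tasks should use, keyed by DocumentProcessor.ANALYZER_CLASS.
        conf.set(DocumentProcessor.ANALYZER_CLASS, StandardAnalyzer.class.getName());

        // Task side: read the property back (falling back to StandardAnalyzer
        // when it is unset) and instantiate the Analyzer reflectively.
        Analyzer analyzer = ClassUtils.instantiateAs(
                conf.get(DocumentProcessor.ANALYZER_CLASS, StandardAnalyzer.class.getName()),
                Analyzer.class);

        System.out.println("Tokenizing with " + analyzer.getClass().getName());
    }
}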

Usage

From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    analyzer = ClassUtils.instantiateAs(
            context.getConfiguration().get(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName()),
            Analyzer.class);
}

From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);

    String analyzerClassName = context.getConfiguration().get(DocumentProcessor.ANALYZER_CLASS,
            StandardAnalyzer.class.getName());
    try {
        analyzer = AnalyzerUtils.createAnalyzer(analyzerClassName);
    } catch (ClassNotFoundException e) {
        throw new IOException("Unable to create analyzer: " + analyzerClassName, e);
    }
}

From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 6) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    String analyzerClassName = args[4];
    int maxIdsPerReq = Integer.valueOf(args[5]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    // upload dictionary file to HDFS
    //      FileSystem fs = FileSystem.get(getConf());
    //      Path dictionaryPath = new Path(outputPath, Utilities.path2FileName(dictionaryFile));
    //      BufferedWriter writer = new BufferedWriter(
    //            new OutputStreamWriter(fs.create(dictionaryPath, true)));
    //      BufferedReader reader = new BufferedReader(new FileReader(dictionaryFile));
    //      String line = null;
    //      while ((line = reader.readLine()) != null) {
    //         writer.write(line + "\n");
    //      }
    //      writer.close();

    // 
    Job job = new Job(getConf(), "Copy and tokenize data from HTRC data storage parallely.");
    job.setJarByClass(DataCopyTokenizerJob.class);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set distributed cache
    //      Path dictionaryPath = new Path(dictionaryFile);
    //      DistributedCache.setCacheFiles(new URI[] {dictionaryPath.toUri()}, job.getConfiguration());

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);

    job.setMapperClass(DataCopyTokenizerMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("DataCopyTokenizerJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerMapper.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);

    analyzer = ClassUtils.instantiateAs(
            context.getConfiguration().get(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName()),
            Analyzer.class);

    // Configuration conf = context.getConfiguration();
    // URI[] localFiles = DistributedCache.getCacheFiles(conf);
    // if (localFiles == null || localFiles.length == 0)
    // throw new RuntimeException(
    // "Cannot find paths from distribute cache.");
    // Path dictionaryFile = new Path(localFiles[0].getPath());
    // FileSystem fs = FileSystem.get(conf);
    // BufferedReader reader = new BufferedReader(new InputStreamReader(
    // fs.open(dictionaryFile)));
    // String term = null;
    // while ((term = reader.readLine()) != null) {
    // dictionary.add(term.toLowerCase());
    // }
    // reader.close();
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsFromRawText.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 7) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    int maxIdsPerSplit = Integer.valueOf(args[3]);
    String dataAPIConfClassName = args[4];
    String analyzerClassName = args[5];
    int maxIdsPerReq = Integer.valueOf(args[6]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    //
    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SparseVectorsFromRawText.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License:Apache License

private void setupConfiguration(Configuration conf) throws ClassNotFoundException, IOException {
    // set dictionary
    conf.set(HTRCConstants.DICTIONARY_PATH, dictDir);

    // set analyzer
    conf.set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set data api conf
    conf.setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);

    // set memcached conf
    MemCachedUtil.configHelper(conf, memHostsPath);
}

From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License:Apache License

private void sequentialTransform() throws Exception {
    Configuration conf = getConf();
    setupConfiguration(conf);

    HTRCDataAPIClient client = Utilities.creatDataAPIClient(conf);

    // set up analyzer, filter
    Analyzer analyzer = ClassUtils.instantiateAs(
            conf.get(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName()), Analyzer.class);
    HTRCFilter filter = new StopWordFilter("stopwords.txt"); // found in the
    // classpath
    Dictionary dictionary = new Dictionary(conf);
    filter.addNextFilter(new DictionaryFilter(dictionary));
    filter.addNextFilter(new WordLengthFilter(conf.getInt(HTRCConstants.FILTER_WORD_MIN_LENGTH, 2)));

    // memcached client
    ThreadedMemcachedClient memcachedClient = ThreadedMemcachedClient.getThreadedMemcachedClient(conf);
    MemcachedClient cache = memcachedClient.getCache();
    int maxExpir = conf.getInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, -1);
    Transcoder<VectorWritable> transcoder = new HadoopWritableTranscoder<VectorWritable>(conf,
            VectorWritable.class);

    //
    Path input = new Path(idListDir);
    FileSystem fs = input.getFileSystem(conf);
    DataInputStream fsinput = new DataInputStream(fs.open(input));
    BufferedReader reader = new BufferedReader(new InputStreamReader(fsinput));
    String line = null;
    int idNumThreshold = maxIdsPerReq;
    int idNum = 0;
    StringBuilder idList = new StringBuilder();
    VectorWritable vectorWritable = new VectorWritable();
    while ((line = reader.readLine()) != null) {
        idList.append(line + "|");
        if ((++idNum) >= idNumThreshold) {
            // <id, content>
            Iterable<Entry<String, String>> content = client.getID2Content(idList.toString());
            for (Entry<String, String> entry : content) {
                Vector result = transform2Vector(entry.getValue(), entry.getKey(), analyzer, filter,
                        dictionary);
                vectorWritable.set(result);
                cache.set(entry.getKey(), maxExpir, vectorWritable, transcoder);

                // validate
                VectorWritable vecWritable = cache.get(entry.getKey(), transcoder);
                if (vecWritable == null) {
                    throw new RuntimeException(entry.getKey() + " is not written to Memcached.");
                } else {
                    System.out.println(entry.getKey());
                }
            }

            idList = new StringBuilder();
            idNum = 0;
        }
    }
    if (idList.length() > 0) {
        Iterable<Entry<String, String>> content = client.getID2Content(idList.toString());
        for (Entry<String, String> entry : content) {
            Vector result = transform2Vector(entry.getValue(), entry.getKey(), analyzer, filter, dictionary);
            vectorWritable.set(result);
            cache.set(entry.getKey(), maxExpir, vectorWritable, transcoder);

            // validate
            VectorWritable vecWritable = cache.get(entry.getKey(), transcoder);
            if (vecWritable == null) {
                throw new RuntimeException(entry.getKey() + " is not written to Memcached.");
            } else {
                System.out.println(entry.getKey());
            }
        }
    }
}

From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2HDFS.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    String analyzerClassName = args[3];

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - analyzerName: " + analyzerClassName);

    //
    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SVFromHDFS2HDFS.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}

From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2Memcached.java

License:Apache License

private void setupConfiguration(Configuration conf) throws ClassNotFoundException, IOException {
    // set dictionary
    conf.set(HTRCConstants.DICTIONARY_PATH, dictDir);

    // set analyzer
    conf.set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set memcached conf
    MemCachedUtil.configHelper(conf, memHostsPath);
}