Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

This page collects example usages of the addAttribute method of org.apache.lucene.analysis.TokenStream.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value. The method returns the attribute instance of that class, creating one and registering it with the stream if none is present yet; each subsequent successful call to incrementToken() populates the returned instance.
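
Before the per-project examples below, here is a minimal, self-contained sketch of the typical consume loop (an illustration assuming Lucene 4.3 on the classpath; the class name, field name, and input text are arbitrary). addAttribute is called once before iterating, and the returned attribute instance is re-read after each successful incrementToken().

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class AddAttributeSketch {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        TokenStream stream = analyzer.tokenStream("field", new StringReader("a quick example"));

        // Register CharTermAttribute with the stream (or get the instance
        // already registered); it is repopulated on every successful
        // incrementToken() call.
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

        try {
            stream.reset();                     // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();                       // finish consuming the stream
        } finally {
            stream.close();                     // release stream resources
        }
        analyzer.close();
    }
}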

Usage

From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java

License:Open Source License

/**
 * This function assumes that the TFIDF vector of the document containing text is already
 * given. We simply build a tfidf-vector of the text out of the docVector. 
 * The purpose of doing this is to save the time computing the tf-idf value for words in
 * the same document.
 * 
 * @param text
 * @param docVector
 * @return
 */
public Map<String, Float> TextTFIDFVector(String text, Map<String, Float> docVector) {
    Map<String, Float> map = new HashMap<String, Float>();

    // preprocess the text with StandardAnalyzer (standard tokenization, lowercasing, and stop-word removal)
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
    TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();

            if (docVector.containsKey(term))
                map.put(term, docVector.get(term));
        }
        tokenStream.end();   // complete the stream per the TokenStream contract
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }

    analyzer.close();

    return map;
}

From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java

License:Open Source License

public boolean contain(String label) {
    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);
        // use the boolean query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.MUST);
        }

        ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label);
        searcher.search(bq, collector);
        boolean ret = collector.isExistQueryLabel();
        searcher.close();
        reader.close();
        return ret;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return false;
}

From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java

License:Open Source License

public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) {
    TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>();
    if (query == null) {
        ret.add(new SimilarLabelQueryResult(null, 1));
        return ret;
    }
    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);

        // get terms from query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        SynonymMap synMap = SynonymIndex.getSynonymMap();
        HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet);

        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.SHOULD);
            // expand using synonyms
            for (String syn : synMap.getSynonyms(s)) {
                stemer.setCurrent(syn);
                stemer.stem();
                syn = stemer.getCurrent();
                if (expandedQueryTermSet.add(syn)) {
                    term = new Term(LabelDocument.FIELD_LABEL, syn);
                    termQuery = new TermQuery(term);
                    bq.add(termQuery, Occur.SHOULD);
                }
            }
        }

        // search in the label index
        SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader, queryTermSet,
                similarity);
        searcher.search(bq, collector);
        ret = collector.getQueryResult();
        searcher.close();
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}

From source file:cn.edu.thss.iise.beehivez.server.index.luceneindex.analyzer.SemicolonAnalyzer.java

License:Open Source License

/**
 * @param args
 */
public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of , the new TokenStream API";

    SemicolonAnalyzer analyzer = new SemicolonAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the TermAttribute from the TokenStream
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);

    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt.term());
    }

    stream.end();
    stream.close();

}

From source file:cn.edu.thss.iise.beehivez.server.util.StringSimilarityUtil.java

License:Open Source License

/**
 * Tokenize the given string: all words are extracted and lowercased,
 * stop words are removed, and each word is replaced with its stem.
 * 
 * @param label
 * @return
 */
public static HashSet<String> snowballTokenize(String label) {
    HashSet<String> ret = new HashSet<String>();
    try {
        Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
                StandardAnalyzer.STOP_WORDS_SET);

        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(termAtt.term());
        }
        stream.end();
        stream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }

    return ret;
}

From source file:com.aliasi.lingmed.medline.SearchableMedlineCodec.java

License:Lingpipe license

public static void main(String[] args) throws Exception {
    org.apache.lucene.store.RAMDirectory directory = new org.apache.lucene.store.RAMDirectory();

    // org.apache.lucene.analysis.SimpleAnalyzer analyzer 
    // = new org.apache.lucene.analysis.SimpleAnalyzer();
    // org.apache.lucene.analysis.KeywordAnalyzer analyzer 
    // = new org.apache.lucene.analysis.KeywordAnalyzer();
    MedlineCodec codec = new MedlineCodec();
    Analyzer analyzer = codec.getAnalyzer();

    org.apache.lucene.index.IndexWriterConfig iwConf = new org.apache.lucene.index.IndexWriterConfig(
            org.apache.lucene.util.Version.LUCENE_36, analyzer);
    iwConf.setOpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    org.apache.lucene.index.IndexWriter indexWriter = new org.apache.lucene.index.IndexWriter(directory,
            iwConf);

    Document doc = new Document();
    doc.add(new Field(Fields.MESH_MINOR_FIELD, "abc", Field.Store.NO, Field.Index.ANALYZED));
    doc.add(new Field(Fields.MESH_MINOR_FIELD, " xyz efg", Field.Store.NO, Field.Index.ANALYZED));
    indexWriter.addDocument(doc);
    indexWriter.close();

    org.apache.lucene.index.IndexReader reader = org.apache.lucene.index.IndexReader.open(directory);
    org.apache.lucene.search.IndexSearcher searcher = new org.apache.lucene.search.IndexSearcher(reader);

    org.apache.lucene.queryParser.QueryParser qp = new org.apache.lucene.queryParser.QueryParser(
            org.apache.lucene.util.Version.LUCENE_36, "foo", analyzer);
    org.apache.lucene.search.Query query = qp.parse(Fields.MESH_MINOR_FIELD + ":efg");

    org.apache.lucene.search.TopDocs hits = searcher.search(query, 1000);
    System.out.println("hits.length()=" + hits.scoreDocs.length);

    org.apache.lucene.analysis.TokenStream ts = analyzer.tokenStream(Fields.MESH_MINOR_FIELD,
            new java.io.StringReader("abc xyz efg"));
    org.apache.lucene.analysis.tokenattributes.CharTermAttribute terms = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    org.apache.lucene.analysis.tokenattributes.OffsetAttribute offsets = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute positions = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);

    ts.reset();   // required before the first incrementToken()
    while (ts.incrementToken()) {
        int increment = positions.getPositionIncrement();
        int start = offsets.startOffset();
        int end = offsets.endOffset();
        String term = terms.toString();
        System.out.println("token=|" + term + "|" + " startOffset=" + start + " endOffset=" + end
                + " positionIncr=" + increment);
    }
    ts.end();
    ts.close();
}

From source file:com.billiger.solr.handler.component.QLTBComponent.java

License:Apache License

/**
 * Get analyzed version of the query string.
 *
 * This uses the analyzer for the configured FieldType for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query will be
 * returned.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}

From source file:com.chimpler.example.bayes.Classifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = termAtt.toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label 
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print("  " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}

From source file:com.cloudera.knittingboar.records.Test20NewsgroupsBookParsing.java

License:Apache License

/**
 * Counts words
 * 
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    System.out.println("> ----- countWords ------");

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();   // required before the first incrementToken()

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = termAtt.toString();
        System.out.print(" " + s);
        words.add(s);
    }
    ts.end();
    ts.close();

    System.out.println("\n<");

    /*overallCounts.addAll(words);*/
}

From source file:com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();   // required before the first incrementToken()

    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        words.add(termAtt.toString());
    }
    ts.end();
    ts.close();

}