List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
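Every example on this page follows the same consumption contract: obtain a TokenStream from an Analyzer, register the attributes you want to read, call reset() before the first incrementToken(), pull tokens until incrementToken() returns false, then call end() and close(). A minimal, self-contained sketch of that loop against a recent Lucene (5+); the field name "body" and the sample text are placeholders, and any stock analyzer works:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamLoop {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "Hello token stream world")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) { // false once the stream is exhausted
                System.out.println(term.toString());
            }
            ts.end();                     // records final offset state
        }                                 // try-with-resources calls close()
    }
}

Note that several of the snippets below were written against Lucene 3.x and omit reset(); from Lucene 4 onward the TokenStream state machine enforces it, typically failing with an IllegalStateException about a contract violation.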
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
private String getTerm(String field, String text, Analyzer analyzer) throws XPathException {
    String term = null;
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        if (stream.incrementToken()) {
            term = termAttr.toString();
        }
        stream.end();
        stream.close();
        return term;
    } catch (IOException e) {
        throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e);
    }
}
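This helper deliberately keeps only the first token: incrementToken() is called once, so if the analyzer splits the text into several tokens, everything after the first is discarded, which fits its role of producing a single query term.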
From source file:org.exist.indexing.range.RangeIndexWorker.java
License:Open Source License
protected BytesRef analyzeContent(String field, QName qname, String data, DocumentSet docs)
        throws XPathException {
    final Analyzer analyzer = getAnalyzer(qname, field, docs);
    if (!isCaseSensitive(qname, field, docs)) {
        data = data.toLowerCase();
    }
    if (analyzer == null) {
        return new BytesRef(data);
    }
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(data));
        TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
        BytesRef token = null;
        try {
            stream.reset();
            if (stream.incrementToken()) {
                termAttr.fillBytesRef();
                token = termAttr.getBytesRef();
            }
            stream.end();
        } finally {
            stream.close();
        }
        return token;
    } catch (IOException e) {
        throw new XPathException("Error analyzing the query string: " + e.getMessage(), e);
    }
}
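A version note: the fillBytesRef()/getBytesRef() pair is the Lucene 4.x shape of TermToBytesRefAttribute. In Lucene 5.0 fillBytesRef() was removed and getBytesRef() returns the term bytes directly, so on newer versions the body of the if-block reduces to token = BytesRef.deepCopyOf(termAttr.getBytesRef()); (the deep copy matters because the returned BytesRef is reused across tokens).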
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.AbstractExcerpt.java
License:Apache License
/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            String termText = new String(term.buffer(), 0, term.length());
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
        if (LOG.isTraceEnabled()) {
            LOG.trace("An exception occurred: " + e.getMessage());
        }
    }
    return new TermPositionVector() {
        private String[] terms = termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
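TermPositionVector and TermVectorOffsetInfo are Lucene 3.x types that were removed in Lucene 4.0, where term vectors are read through Terms and TermsEnum instead, so this snippet compiles only against the older API; the incrementToken() loop that collects per-term offsets is independent of that and carries over unchanged.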
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.MoreLikeThis.java
License:Apache License
/**
 * Adds term frequencies found by tokenizing text from the reader into the map of words.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName used by the analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        String word = new String(term.buffer(), 0, term.length());
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
From source file:org.fao.geonet.kernel.search.LuceneSearcher.java
License:Open Source License
/**
 * Splits text into tokens using the Analyzer that is matched to the field.
 *
 * @param field the field whose analyzer is used
 * @param requestStr the text to tokenize, possibly quoted as a phrase
 * @param a the per-field analyzer wrapper
 * @return the tokenized text, re-quoted if the input was a phrase
 */
private static String analyzeText(String field, String requestStr, PerFieldAnalyzerWrapper a) {
    boolean phrase = false;
    if (requestStr.startsWith("\"") && requestStr.endsWith("\"")) {
        phrase = true;
    }
    TokenStream ts = a.tokenStream(field, new StringReader(requestStr));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    List<String> tokenList = new ArrayList<String>();
    try {
        while (ts.incrementToken()) {
            tokenList.add(termAtt.term());
        }
    } catch (Exception e) {
        // TODO why swallow
        e.printStackTrace();
    }
    StringBuilder result = new StringBuilder();
    for (int i = 0; i < tokenList.size(); i++) {
        if (i > 0) {
            result.append(" ");
        }
        result.append(tokenList.get(i));
    }
    String outStr = result.toString();
    if (phrase) {
        outStr = "\"" + outStr + "\"";
    }
    return outStr;
}
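TermAttribute and its term() accessor are the pre-4.0 API; CharTermAttribute replaced them. A hedged sketch of the same collection loop on a current Lucene, assuming the surrounding method declares or handles IOException (field, requestStr, and the analyzer are as above; the reset()/end() calls missing from the original are included, since Lucene 4+ enforces them):

try (TokenStream ts = a.tokenStream(field, new StringReader(requestStr))) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                       // required before the first incrementToken()
    while (ts.incrementToken()) {
        tokenList.add(termAtt.toString());
    }
    ts.end();
}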
From source file:org.fastcatsearch.ir.index.SearchIndexWriter.java
License:Apache License
private void indexValue(int docNo, int i, Object value, boolean isIgnoreCase, int positionIncrementGap)
        throws IOException, IRException {
    if (value == null) {
        return;
    }
    char[] fieldValue = value.toString().toCharArray();
    TokenStream tokenStream = indexAnalyzerList[i].tokenStream(indexId, new CharArrayReader(fieldValue),
            indexingAnalyzerOption);
    tokenStream.reset();

    CharsRefTermAttribute termAttribute = null;
    PositionIncrementAttribute positionAttribute = null;
    StopwordAttribute stopwordAttribute = null;
    AdditionalTermAttribute additionalTermAttribute = null;
    CharTermAttribute charTermAttribute = null;

    // Pick up only the attributes this stream actually provides.
    if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
        termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
    }
    if (tokenStream.hasAttribute(PositionIncrementAttribute.class)) {
        positionAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    }
    if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
        additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
    }
    // Tokens flagged as stopwords are skipped below.
    if (tokenStream.hasAttribute(StopwordAttribute.class)) {
        stopwordAttribute = tokenStream.getAttribute(StopwordAttribute.class);
    }
    if (tokenStream.hasAttribute(CharTermAttribute.class)) {
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    }

    int lastPosition = 0;
    while (tokenStream.incrementToken()) {
        CharVector key = null;
        if (termAttribute != null) {
            CharsRef charRef = termAttribute.charsRef();
            char[] buffer = new char[charRef.length()];
            System.arraycopy(charRef.chars, charRef.offset, buffer, 0, charRef.length);
            key = new CharVector(buffer, 0, buffer.length);
        } else {
            key = new CharVector(charTermAttribute.buffer(), 0, charTermAttribute.length());
        }
        int position = -1;
        if (positionAttribute != null) {
            position = positionAttribute.getPositionIncrement() + positionIncrementGap;
            lastPosition = position;
        }
        // logger.debug("FIELD#{}: {} >> {} ({})", indexId, key, docNo, position);
        if (stopwordAttribute != null && stopwordAttribute.isStopword()) {
            // ignore stopwords
        } else {
            memoryPosting.add(key, docNo, position);
        }
        // if (synonymAttribute != null) {
        //     CharVector[] synonym = synonymAttribute.getSynonym();
        //     if (synonym != null) {
        //         for (CharVector token : synonym) {
        //             memoryPosting.add(token, docNo, position);
        //         }
        //     }
        // }
        if (additionalTermAttribute != null && additionalTermAttribute.size() > 0) {
            // Index expanded terms at the position of the last regular token.
            Iterator<String> iter = additionalTermAttribute.iterateAdditionalTerms();
            while (iter.hasNext()) {
                CharVector token = new CharVector(iter.next().toCharArray());
                memoryPosting.add(token, docNo, lastPosition);
            }
        }
    }
}
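Note that CharsRefTermAttribute, AdditionalTermAttribute, StopwordAttribute, and the three-argument tokenStream(indexId, reader, option) overload are fastcatsearch extensions rather than stock Lucene; only CharTermAttribute, PositionIncrementAttribute, and the incrementToken() contract itself come from org.apache.lucene.analysis. The pattern of guarding every optional attribute with hasAttribute() before getAttribute() is the part worth copying, since getAttribute() throws IllegalArgumentException when the attribute is absent.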
From source file:org.fastcatsearch.plugin.analysis.RunAnalyzer.java
public static void main(String[] args) throws IOException {
    if (args.length != 3) {
        printUsage();
        System.exit(0);
    }
    File pluginDir = new File(args[0]);
    String pluginClassName = args[1];
    String analyzerId = args[2];
    RunAnalyzer runAnalyzer = new RunAnalyzer(pluginDir, pluginClassName);
    AnalyzerPool analyzerPool = runAnalyzer.getAnalyzerPool(analyzerId);
    Analyzer analyzer = null;
    try {
        // Borrow an analyzer instance from the pool for this session.
        analyzer = analyzerPool.getFromPool();
        Scanner sc = new Scanner(System.in);
        System.out.println("==================================");
        System.out.println(" Fastcat analyzer");
        System.out.println(" Enter 'quit' for exit program. ");
        System.out.println("==================================");
        System.out.print("Input String: ");
        while (sc.hasNextLine()) {
            String str = sc.nextLine();
            if (str.equalsIgnoreCase("quit")) {
                break;
            }
            try {
                char[] value = str.toCharArray();
                TokenStream tokenStream = analyzer.tokenStream("", new CharArrayReader(value),
                        new AnalyzerOption());
                tokenStream.reset();

                CharsRefTermAttribute termAttribute = null;
                if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
                    termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
                }
                SynonymAttribute synonymAttribute = null;
                if (tokenStream.hasAttribute(SynonymAttribute.class)) {
                    synonymAttribute = tokenStream.getAttribute(SynonymAttribute.class);
                }
                AdditionalTermAttribute additionalTermAttribute = null;
                if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
                    additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
                }
                StopwordAttribute stopwordAttribute = null;
                if (tokenStream.hasAttribute(StopwordAttribute.class)) {
                    stopwordAttribute = tokenStream.getAttribute(StopwordAttribute.class);
                }
                CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);

                while (tokenStream.incrementToken()) {
                    String word = "";
                    // Prefer CharsRefTermAttribute when the stream provides it;
                    // otherwise fall back to the standard CharTermAttribute.
                    if (termAttribute != null) {
                        word = termAttribute.toString();
                    } else {
                        word = charTermAttribute.toString();
                    }
                    // Skip tokens flagged as stopwords.
                    if (stopwordAttribute != null && stopwordAttribute.isStopword()) {
                        continue;
                    }
                    // Print the token itself.
                    System.out.print(">> ");
                    System.out.println(word);
                    // Print any synonyms attached to this token.
                    if (synonymAttribute != null) {
                        List synonyms = synonymAttribute.getSynonyms();
                        if (synonyms != null) {
                            for (Object synonymObj : synonyms) {
                                if (synonymObj instanceof CharVector) {
                                    CharVector synonym = (CharVector) synonymObj;
                                    System.out.print("S> ");
                                    System.out.println(synonym);
                                } else if (synonymObj instanceof List) {
                                    List synonymList = (List) synonymObj;
                                    for (Object synonym : synonymList) {
                                        System.out.print("S> ");
                                        System.out.println(synonym);
                                    }
                                }
                            }
                        }
                    }
                    // Print any additional (expanded) terms derived from this token.
                    if (additionalTermAttribute != null && additionalTermAttribute.size() > 0) {
                        Iterator<String> termIter = additionalTermAttribute.iterateAdditionalTerms();
                        while (termIter.hasNext()) {
                            String token = termIter.next();
                            System.out.print("A> ");
                            System.out.println(token);
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.print("Input String: ");
        }
    } finally {
        if (analyzer != null) {
            analyzerPool.releaseToPool(analyzer);
        }
    }
    System.out.print("Bye!");
}
From source file:org.genemania.completion.lucene.GeneCompletionProvider.java
License:Open Source License
public Long getNodeId(String symbol) {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        tokens.reset();
        while (tokens.incrementToken()) {
            TermAttribute term = tokens.getAttribute(TermAttribute.class);
            query.add(new Term(GeneIndexBuilder.GENE_FIELD, term.term()));
        }
        tokens.end();
        tokens.close();
        final Set<Long> nodes = new HashSet<Long>();
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int id) {
                try {
                    Document document = searcher.doc(id);
                    nodes.add(Long.parseLong(document.get(GeneIndexBuilder.NODE_ID_FIELD)));
                } catch (IOException e) {
                    log(e);
                }
            }
        });
        if (nodes.size() > 0) {
            return nodes.iterator().next();
        }
    } catch (IOException e) {
        log(e);
    }
    return null;
}
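Here incrementToken() drives query construction rather than indexing: each token of the analyzed gene symbol becomes one term of a PhraseQuery. Newer Lucene versions build the query through PhraseQuery.Builder (introduced in 5.3; the mutable add() was removed when PhraseQuery became immutable in 6.0). A hedged sketch of the same loop on a newer version, with the project's analyze() helper and field constant assumed unchanged and CharTermAttribute standing in for the removed TermAttribute:

PhraseQuery.Builder builder = new PhraseQuery.Builder();
CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
tokens.reset();
while (tokens.incrementToken()) {
    builder.add(new Term(GeneIndexBuilder.GENE_FIELD, termAtt.toString()));
}
tokens.end();
tokens.close();
PhraseQuery query = builder.build();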
From source file:org.genemania.data.classification.lucene.LuceneGeneClassifier.java
License:Open Source License
public void classify(final String symbol, final IGeneClassificationHandler handler)
        throws ApplicationException {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        tokens.reset();
        while (tokens.incrementToken()) {
            TermAttribute term = tokens.getAttribute(TermAttribute.class);
            query.add(new Term(LuceneMediator.GENE_SYMBOL, term.term()));
        }
        tokens.end();
        tokens.close();
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int doc) {
                try {
                    Document document = searcher.doc(doc);
                    long organismId = Long.parseLong(document.get(LuceneMediator.GENE_ORGANISM_ID));
                    handler.handleClassification(symbol, organismId);
                } catch (IOException e) {
                    log(e);
                }
            }
        });
    } catch (IOException e) {
        throw new ApplicationException(e);
    }
}