List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. The method returns the stream's instance of the requested attribute type; if the stream does not contain that attribute, an IllegalArgumentException is thrown, so callers typically guard optional attributes with hasAttribute(Class) first (or use addAttribute, which creates the attribute on demand).
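Before the full examples below, a minimal self-contained sketch of the common pattern. The field name "body", the sample text, and the choice of StandardAnalyzer are illustrative assumptions (not taken from the sources below), and a recent Lucene release is assumed:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("body", new StringReader("red black tree"))) {
            // StandardTokenizer registers a CharTermAttribute, so getAttribute is safe here;
            // requesting an attribute the stream lacks would throw IllegalArgumentException.
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            // Optional attributes are guarded with hasAttribute, as in the examples below.
            OffsetAttribute offset = stream.hasAttribute(OffsetAttribute.class)
                    ? stream.getAttribute(OffsetAttribute.class)
                    : null;
            stream.reset();
            while (stream.incrementToken()) {
                String info = offset == null ? "" : " [" + offset.startOffset() + "-" + offset.endOffset() + "]";
                System.out.println(term.toString() + info);
            }
            stream.end();
        }
    }
}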
From source file:org.fastcatsearch.plugin.analysis.RunAnalyzer.java
public static void main(String[] args) throws IOException {
    if (args.length != 3) {
        printUsage();
        System.exit(0);
    }
    File pluginDir = new File(args[0]);
    String pluginClassName = args[1];
    String analyzerId = args[2];
    RunAnalyzer runAnalyzer = new RunAnalyzer(pluginDir, pluginClassName);
    AnalyzerPool analyzerPool = runAnalyzer.getAnalyzerPool(analyzerId);
    Analyzer analyzer = null;
    try {
        // Borrow an analyzer instance from the pool.
        analyzer = analyzerPool.getFromPool();
        Scanner sc = new Scanner(System.in);
        System.out.println("==================================");
        System.out.println(" Fastcat analyzer");
        System.out.println(" Enter 'quit' for exit program. ");
        System.out.println("==================================");
        System.out.print("Input String: ");
        while (sc.hasNextLine()) {
            String str = sc.nextLine();
            if (str.equalsIgnoreCase("quit")) {
                break;
            }
            try {
                char[] value = str.toCharArray();
                TokenStream tokenStream = analyzer.tokenStream("", new CharArrayReader(value),
                        new AnalyzerOption());
                tokenStream.reset();
                // Optional attributes are checked with hasAttribute() before getAttribute().
                CharsRefTermAttribute termAttribute = null;
                if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
                    termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
                }
                SynonymAttribute synonymAttribute = null;
                if (tokenStream.hasAttribute(SynonymAttribute.class)) {
                    synonymAttribute = tokenStream.getAttribute(SynonymAttribute.class);
                }
                AdditionalTermAttribute additionalTermAttribute = null;
                if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
                    additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
                }
                StopwordAttribute stopwordAttribute = null;
                if (tokenStream.hasAttribute(StopwordAttribute.class)) {
                    stopwordAttribute = tokenStream.getAttribute(StopwordAttribute.class);
                }
                CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
                while (tokenStream.incrementToken()) {
                    // Prefer CharsRefTermAttribute when present; otherwise fall back to CharTermAttribute.
                    String word;
                    if (termAttribute != null) {
                        word = termAttribute.toString();
                    } else {
                        word = charTermAttribute.toString();
                    }
                    // Skip stopwords.
                    if (stopwordAttribute != null && stopwordAttribute.isStopword()) {
                        continue;
                    }
                    System.out.print(">> ");
                    System.out.println(word);
                    // Print synonyms, which may be single terms or nested lists.
                    if (synonymAttribute != null) {
                        List synonyms = synonymAttribute.getSynonyms();
                        if (synonyms != null) {
                            for (Object synonymObj : synonyms) {
                                if (synonymObj instanceof CharVector) {
                                    System.out.print("S> ");
                                    System.out.println((CharVector) synonymObj);
                                } else if (synonymObj instanceof List) {
                                    for (Object synonym : (List) synonymObj) {
                                        System.out.print("S> ");
                                        System.out.println(synonym);
                                    }
                                }
                            }
                        }
                    }
                    // Print additional terms derived from the current token.
                    if (additionalTermAttribute != null && additionalTermAttribute.size() > 0) {
                        Iterator<String> termIter = additionalTermAttribute.iterateAdditionalTerms();
                        while (termIter.hasNext()) {
                            String token = termIter.next();
                            System.out.print("A> ");
                            System.out.println(token);
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.print("Input String: ");
        }
    } finally {
        if (analyzer != null) {
            analyzerPool.releaseToPool(analyzer);
        }
    }
    System.out.print("Bye!");
}
From source file:org.gbif.namefinder.analysis.sciname.SciNameIterator.java
License:Apache License
public SciNameIterator(TokenStream tokens) {
    super();
    this.tokens = tokens;
    termAtt = tokens.getAttribute(CharTermAttribute.class);
    sciNameAtt = tokens.getAttribute(SciNameAttribute.class);
    offsetAtt = tokens.getAttribute(OffsetAttribute.class);
    nextName();
}
From source file:org.genemania.completion.lucene.GeneCompletionProvider.java
License:Open Source License
public Long getNodeId(String symbol) {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        tokens.reset();
        while (tokens.incrementToken()) {
            TermAttribute term = tokens.getAttribute(TermAttribute.class);
            query.add(new Term(GeneIndexBuilder.GENE_FIELD, term.term()));
        }
        tokens.end();
        tokens.close();
        final Set<Long> nodes = new HashSet<Long>();
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int id) {
                try {
                    Document document = searcher.doc(id);
                    nodes.add(Long.parseLong(document.get(GeneIndexBuilder.NODE_ID_FIELD)));
                } catch (IOException e) {
                    log(e);
                }
            }
        });
        if (nodes.size() > 0) {
            return nodes.iterator().next();
        }
    } catch (IOException e) {
        log(e);
    }
    return null;
}
From source file:org.genemania.data.classification.lucene.LuceneGeneClassifier.java
License:Open Source License
public void classify(final String symbol, final IGeneClassificationHandler handler) throws ApplicationException {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        tokens.reset();
        while (tokens.incrementToken()) {
            TermAttribute term = tokens.getAttribute(TermAttribute.class);
            query.add(new Term(LuceneMediator.GENE_SYMBOL, term.term()));
        }
        tokens.end();
        tokens.close();
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int doc) {
                try {
                    Document document = searcher.doc(doc);
                    long organismId = Long.parseLong(document.get(LuceneMediator.GENE_ORGANISM_ID));
                    handler.handleClassification(symbol, organismId);
                } catch (IOException e) {
                    log(e);
                }
            }
        });
    } catch (IOException e) {
        throw new ApplicationException(e);
    }
}
From source file:org.genemania.data.normalizer.GeneCompletionProvider2.java
License:Open Source License
public void computeProposals(final CompletionConsumer consumer, String queryString) {
    try {
        if (queryString.length() == 0) {
            return;
        }
        TokenStream stream = analyzer.tokenStream(LuceneGeneMediator.GENE_SYMBOL,
                new StringReader(queryString));
        TermAttribute term = stream.getAttribute(TermAttribute.class);
        if (!stream.incrementToken()) {
            return;
        }
        BooleanQuery query = new BooleanQuery();
        query.add(new TermQuery(new Term(LuceneMediator.GENE_ORGANISM_ID, String.valueOf(organism.getId()))),
                Occur.MUST);
        query.add(new PrefixQuery(new Term(LuceneMediator.GENE_SYMBOL, term.term())), Occur.MUST);
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int id) {
                try {
                    Document document = searcher.doc(id);
                    consumer.consume(document.get(LuceneMediator.GENE_SYMBOL));
                } catch (IOException e) {
                    log(e);
                }
            }
        });
    } catch (IOException e) {
        log(e);
    } catch (TooManyClauses e) {
        consumer.tooManyCompletions();
    } finally {
        consumer.finish();
    }
}
From source file:org.genemania.mediator.lucene.LuceneMediator.java
License:Open Source License
protected PhraseQuery createPhraseQuery(String field, String phrase) throws IOException {
    TokenStream stream = analyze(phrase);
    stream.reset();
    PhraseQuery query = new PhraseQuery();
    while (stream.incrementToken()) {
        TermAttribute term = stream.getAttribute(TermAttribute.class);
        query.add(new Term(field, term.term()));
    }
    stream.end();
    stream.close();
    return query;
}
From source file:org.gridkit.coherence.search.lucene.CapturedTokenStream.java
License:Apache License
public void append(TokenStream ts, int positionGap, int offsetShift) throws IOException {
    PositionIncrementAttribute pi = ts.getAttribute(PositionIncrementAttribute.class);
    OffsetAttribute off = null;
    if (offsetShift != 0) {
        off = ts.getAttribute(OffsetAttribute.class);
    }
    ts.reset();
    while (ts.incrementToken()) {
        if (positionGap != 0) {
            pi.setPositionIncrement(positionGap);
            positionGap = 0;
        }
        if (off != null) {
            off.setOffset(offsetShift + off.startOffset(), offsetShift + off.endOffset());
        }
        tokens.add(ts.captureState());
        lastPos += pi.getPositionIncrement();
    }
}
From source file:org.gridkit.coherence.search.lucene.TokenStreamCheck.java
License:Apache License
@Test
public void analyze() throws IOException {
    WhitespaceAnalyzer wa = new WhitespaceAnalyzer(Version.LUCENE_42);
    wa.getOffsetGap("xxx");
    TokenStream ts = wa.tokenStream("test", new StringReader("red black tree"));
    ts.reset();
    ts.incrementToken();
    ts.getAttribute(CharTermAttribute.class).buffer();
    CapturedTokenStream cts = new CapturedTokenStream(ts);
    cts.reset();
    cts.incrementToken();
    cts.getAttribute(CharTermAttribute.class).buffer();
}
From source file:org.hbasene.index.HBaseIndexWriter.java
License:Apache License
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
    String docId = doc.get(this.primaryKeyField);
    if (docId == null) {
        throw new IllegalArgumentException(
                "Primary Key " + this.primaryKeyField + " not present in the document to be added ");
        // TODO: Special type of exception needed ?
    }
    int position = 0;
    Map<String, List<Integer>> termPositions = new HashMap<String, List<Integer>>();
    Map<String, byte[]> fieldsToStore = new HashMap<String, byte[]>();
    for (Fieldable field : doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            tokens.addAttribute(TermAttribute.class);
            tokens.addAttribute(PositionIncrementAttribute.class);
            // collect term frequencies per doc
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // Build the termPositions vector for all terms
            while (tokens.incrementToken()) {
                String term = createColumnName(field.name(), tokens.getAttribute(TermAttribute.class).term());
                List<Integer> pvec = termPositions.get(term);
                if (pvec == null) {
                    pvec = Lists.newArrayList();
                    termPositions.put(term, pvec);
                }
                position += (tokens.getAttribute(PositionIncrementAttribute.class).getPositionIncrement() - 1);
                pvec.add(++position);
            }
            tokens.close();
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            String term = this.createColumnName(field.name(), field.stringValue());
            termPositions.put(term, EMPTY_TERM_POSITIONS);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // first byte flags if binary or not
            final byte[] prefix = Bytes.toBytes((field.isBinary() ? 'B' : 'T'));
            fieldsToStore.put(field.name(), Bytes.add(prefix, value));
        }
    }
    indexStore.indexDocument(docId, new DocumentIndexContext(termPositions, fieldsToStore));
    termPositions.clear();
    fieldsToStore.clear();
}
From source file:org.hibernate.search.backend.spi.SingularTermDeletionQuery.java
License:LGPL
@Override
public Query toLuceneQuery(DocumentBuilderIndexedEntity documentBuilder) {
    AnalyzerReference analyzerReferenceForEntity = documentBuilder.getAnalyzerReference();
    String stringValue = documentBuilder.objectToString(fieldName, this.getValue(),
            new ContextualExceptionBridgeHelper());
    if (this.getType() == Type.STRING) {
        try {
            if (analyzerReferenceForEntity.is(RemoteAnalyzerReference.class)) {
                // no need to take into account the analyzer here as it will be dealt with remotely
                return new TermQuery(new Term(this.getFieldName(), stringValue));
            }
            ScopedLuceneAnalyzer analyzerForEntity = (ScopedLuceneAnalyzer) analyzerReferenceForEntity
                    .unwrap(LuceneAnalyzerReference.class).getAnalyzer();
            TokenStream tokenStream = analyzerForEntity.tokenStream(this.getFieldName(), stringValue);
            tokenStream.reset();
            try {
                BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
                while (tokenStream.incrementToken()) {
                    String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
                    booleanQueryBuilder.add(new TermQuery(new Term(this.getFieldName(), term)), Occur.FILTER);
                }
                return booleanQueryBuilder.build();
            } finally {
                tokenStream.close();
            }
        } catch (IOException e) {
            throw new AssertionFailure(
                    "No IOException can occur while using a TokenStream that is generated via String");
        }
    } else {
        FieldBridge fieldBridge = documentBuilder.getBridge(fieldName);
        if (NumericFieldUtils.isNumericFieldBridge(fieldBridge)) {
            return NumericFieldUtils.createExactMatchQuery(fieldName, this.getValue());
        } else {
            return new TermQuery(new Term(this.getFieldName(), stringValue));
        }
    }
}