Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream.incrementToken().

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
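
Before the examples, here is a minimal, self-contained consumer sketch (assuming Lucene 5.x or later; the field name "body" and the sample text are arbitrary). It follows the documented workflow around incrementToken(): call reset() before the first call, read each token through its attributes, then call end() and close().

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("body", "Hello token stream world");
            // Attribute instances are reused across calls to incrementToken().
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            stream.end();                     // performs end-of-stream operations
            stream.close();                   // releases the stream's resources
        }
    }
}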

Usage

From source file:at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java

License:Apache License

private void updateWeightHighlighting() {
    // remove previous highlighting
    removeHighLights(weightingHighLights);
    if (weightHighlightBox.isSelected()) {
        if (inputDataObjects.getTemplateVector() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Template vector file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        if (inputDataObjects.getInputData() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Input data file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }

        SOMLibTemplateVector tv = inputDataObjects.getTemplateVector();
        InputData data = inputDataObjects.getInputData();
        InputDatum input = data.getInputDatum(currentInput);

        double maxValue = data.getMaxValue();
        double minValue = data.getMinValue();
        double span = maxValue - minValue;

        // init paints
        Palette p = paletteSelectionPanel.getSelectedPalette();
        int paletteLength = p.getNumberOfColours();
        weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength];
        for (int i = 0; i < weightPaints.length; i++) {
            weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i));
        }

        String text = textPane.getText();
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        try {
            stream.reset(); // required before the first call to incrementToken()
            while (stream.incrementToken()) {
                TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class);
                if (!at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(),
                        "<APOSTROPHE>")) {
                    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
                    String term = termAttribute.term();
                    if (tv.containsLabel(term)) {
                        int index = tv.getIndex(term);
                        double value = input.getVector().getQuick(index);
                        int colorIndex = (int) (paletteLength / 4d
                                + relativeValue(minValue, span, value) * paletteLength / 2d);
                        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
                        Object tag = highlighter.addHighlight(offsetAttribute.startOffset(),
                                offsetAttribute.endOffset(), weightPaints[colorIndex]);
                        weightingHighLights.add(tag);
                    }
                }
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        }
    }
}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.TweetAnalyzerBolt.java

License:Apache License

@Override
public void execute(Tuple input) {
    try {
        String tweet = (String) input.getValueByField(StreamIDs.TWEET);
        Reader reader = new StringReader(tweet);

        LanguageIdentifier identifier = new LanguageIdentifier(tweet);
        NewsRecLuceneAnalyzer analyzer = LanguageAnalyzerHelper.getInstance()
                .getAnalyzer(new Locale(identifier.getLanguage()));

        TokenStream tokenStream = analyzer.tokenStream("", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
        tokenStream.end();
        tokenStream.close();
        reader.close();

        for (String term : extractNames(tweet, analyzer.getStopwords())) {
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
    } catch (IOException ex) {
        logger.error(ex);
    }
}

From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java

License:Apache License

public List<String> getTermList(String contentText) {
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                result.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}

From source file:br.bireme.ngrams.Tools.java

public static void showTokens(final Analyzer analyzer, final String fieldName, final String text)
        throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        final String term = charTermAttribute.toString();

        System.out.println(term + " [" + startOffset + "," + endOffset + "]");
    }
    tokenStream.end();
    tokenStream.close();
}

From source file:br.edu.utfpr.cm.JGitMinerWeb.services.matrix.auxiliary.LuceneUtil.java

public static List<String> tokenizeString(String linha) {

    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_46);

    List<String> result = new ArrayList<>();

    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }

    return result;
}

From source file:br.edu.utfpr.cm.JGitMinerWeb.util.LuceneUtil.java

public static List<String> tokenizeString(String linha) {

    Analyzer analyzer = new StopAnalyzer();

    List<String> result = new ArrayList<>();

    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }

    return result;
}

From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java

public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        String valor = "";
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }
        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        String query = "";
        BooleanQuery wrapBq = new BooleanQuery();
        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                case 0:
                    acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                    bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    break;
                case 1:
                    acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                    break;
                default:
                    break;
                }
            } else {
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }

        stream.end();
        stream.close();
        // Tokenization ends here.
        // A FuzzyQuery performs the approximate matching.

        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            //new QueryParser(Version.LUCENE_47, field, new StandardAnalyzer(Version.LUCENE_47)).parse(query)
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Tempo para construção da query: " + (System.currentTimeMillis() - start) + "ms";
        // Fetch the documents found by the search
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Tempo para busca : " + (System.currentTimeMillis() - start) + "ms";
        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}

From source file:byrne.mitre.MitreQuery.java

License:Apache License

public void run() {

    try {

        TokenStream tokenStream = analyzer.tokenStream("ngrams", new StringReader(entry.getFullName()));

        BooleanQuery bq = new BooleanQuery();
        tokenStream.reset(); // required before the first call to incrementToken()
        while (tokenStream.incrementToken()) {
            Term t = new Term("ngrams", tokenStream.getAttribute(TermAttribute.class).term());
            bq.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
        }
        tokenStream.end();
        tokenStream.close();

        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(bq, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        for (int i = 0; i < hits.length; ++i) {

            int docId = hits[i].doc;
            Document d = searcher.doc(docId);

            out.write(entry.getID() + "|" + d.get("id") + "|" + df.format(hits[i].score) + "\n");
        }
    } catch (IOException IOE) {
        // note: IOExceptions are silently swallowed here
    }
}

From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java

License:Open Source License

/**
 * Filter the string with StandardAnalyzer.
 * @param str the string to filter
 * @param removeStopWords indicates whether the stop words should be removed
 * @return the filtered string
 */
public static String processString(String str, boolean removeStopWords) {
    StringBuffer strBuf = new StringBuffer();

    try {
        Analyzer analyzer = null;
        if (removeStopWords)
            analyzer = new StandardAnalyzer(Version.LUCENE_34);
        else
            analyzer = new TextAnalyzerWithStopwords(Version.LUCENE_34);

        TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(str));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            strBuf.append(term + " ");
        }
        tokenStream.end();
        tokenStream.close();

        analyzer.close();
    } catch (Exception e) {
        e.printStackTrace();
    }

    return strBuf.toString().trim();
}

From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java

License:Open Source License

/**
 * This function assumes that the TF-IDF vector of the document containing the text is
 * already given. We simply build a TF-IDF vector of the text out of the docVector.
 * The purpose of doing this is to save the time of computing the TF-IDF values for
 * words in the same document.
 * 
 * @param text
 * @param docVector
 * @return
 */
public Map<String, Float> TextTFIDFVector(String text, Map<String, Float> docVector) {
    Map<String, Float> map = new HashMap<String, Float>();

    //preprocess the text using StandardAnalyzer (StandardAnalyzer2 + StopAnalyzer).
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
    TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();

            if (docVector.containsKey(term))
                map.put(term, docVector.get(term));
        }
        tokenStream.end();
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }

    analyzer.close();

    return map;
}